# In [None]:
#imports
import pandas as pd
import os
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Locate the input CSV inside the directory the script is launched from
cwd = os.getcwd()
input_path = os.path.join(cwd, "BA_reviews.csv")

# Load the CSV contents into a DataFrame
df = pd.read_csv(input_path)

# Flag reviews whose text carries the "Trip Verified" marker.
# na=False maps missing review text to False (instead of NaN) so the column
# stays purely boolean; regex=False matches the marker as a literal substring.
df['verified'] = df['reviews'].str.contains("Trip Verified", regex=False, na=False)

# Build 'corpus': a normalised copy of each review (letters only, lower-cased,
# stop words removed, remaining words lemmatized). 'reviews' itself is untouched.
lemma = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
_NON_LETTERS = re.compile(r'[^a-zA-Z]')  # compiled once, reused for every review


def _clean_review(text):
    """Return *text* as a space-joined string of lemmatized non-stop-words."""
    words = _NON_LETTERS.sub(' ', text).lower().split()
    return " ".join(lemma.lemmatize(word) for word in words if word not in stop_words)


# Missing reviews are read by pandas as NaN (a float), which the regex
# substitution cannot process; treat them as empty text so they produce an
# empty corpus entry instead of raising TypeError.
corpus = [_clean_review(rev) for rev in df['reviews'].fillna("").astype(str)]
df['corpus'] = corpus

# Parse 'stars' as numbers; entries that cannot be parsed become NaN
df['stars'] = pd.to_numeric(df['stars'], errors='coerce')

# Keep only rows that have both a 'stars' and a 'country' value, then
# renumber the index from 0 without keeping the old index as a column
required_columns = ['stars', 'country']
df = df.dropna(subset=required_columns).reset_index(drop=True)

# Write the cleaned DataFrame to a CSV file in the working directory,
# leaving the index out of the output
output_path = os.path.join(cwd, "cleaned-BA-reviews.csv")
df.to_csv(output_path, index=False)
