In [None]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [None]:
# Build the working DataFrame from BA_reviews.csv in the current working
# directory; the file's first column is used as the index.

cwd = os.getcwd()
reviews_csv_path = cwd + "/BA_reviews.csv"

df = pd.read_csv(reviews_csv_path, index_col=0)

In [None]:
# Preview the first ten rows of the freshly loaded frame
df.head(n=10)

In [None]:
# Flag every review whose text contains the "Trip Verified" marker and store
# the result in a new 'verified' column.

has_verified_marker = df["reviews"].str.contains("Trip Verified")
df["verified"] = has_verified_marker
df["verified"]

Cleaning Reviews
We will extract the reviews column into a separate pandas Series and clean it for semantic analysis.

In [None]:
# NLTK supplies the lemmatizer and the English stop-word list used for cleaning
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

# Build the stop-word set once; constructing it inside the comprehension would
# rebuild the same set for every single word of every review.
english_stopwords = set(stopwords.words("english"))

# Remove the leading "<check mark> Trip Verified |" marker. A regex anchored at the
# start of the text is used instead of str.strip(): strip() treats its argument as a
# SET of characters and would also eat letters such as r, i, p, e, f, d from both
# ends of the review. [^A-Za-z]* absorbs the check mark however it was encoded.
reviews_data = df.reviews.str.replace(
    r"^[^A-Za-z]*Trip Verified\s*\|\s*", "", regex=True
).str.strip()


def _clean_review(text):
    """Normalize one review for the corpus.

    Replaces every non-letter with a space, lower-cases the text, drops English
    stop words and lemmatizes the remaining words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    kept = [lemma.lemmatize(word) for word in words if word not in english_stopwords]
    return " ".join(kept)


# cleaned data corpus: one cleaned string per review, in the original row order
corpus = [_clean_review(rev) for rev in reviews_data]

In [None]:
# Attach the cleaned review text as a new 'corpus' column; `corpus` is a list
# built in review order, so it is assigned positionally, one entry per row.

df['corpus'] = corpus

In [None]:
# Preview the first ten rows, now including the 'corpus' column
df.head(n=10)

In [None]:
# Cleaning/formatting the date column
# List each column's dtype first, to see how the date column was read in
df.dtypes

In [None]:
# Parse the 'date' column into pandas datetime values

df["date"] = pd.to_datetime(df["date"])

In [None]:
# Confirm the conversion by looking at the first few dates
df["date"].head()

In [None]:
# Cleaning the star ratings
# Start by listing the distinct raw values in the 'stars' column
df["stars"].unique()

In [None]:
# Remove the surrounding newlines/tabs from the ratings.
# str.strip() interprets its argument as a set of characters, so spelling out a run
# of thirteen tabs adds nothing; calling it without an argument strips every kind of
# leading/trailing whitespace (including \n and \t) in one go.
df["stars"] = df["stars"].str.strip()

In [None]:
# How often does each rating value occur?
df["stars"].value_counts()

In [None]:
# Discard every row whose rating is the literal string "None":
# collect the index labels of those rows, then drop them in place.
none_rating_labels = df.loc[df["stars"] == "None"].index
df.drop(index=none_rating_labels, inplace=True)

In [None]:
# Re-list the distinct rating values to verify the cleanup
df["stars"].unique()

In [None]:
# Null check: count the rows for each combination of missing/present columns
df.isna().value_counts()

In [None]:
# Count rows with and without a country value
df["country"].isna().value_counts()

In [None]:
# Remove, by index label and in place, the rows that have no country value
missing_country = df["country"].isna()
df.drop(index=df.loc[missing_country].index, inplace=True)

In [None]:
# (rows, columns) remaining after the drops above
df.shape


In [None]:
# Reset the index so the remaining rows are numbered 0..n-1 again.
# reset_index() returns a NEW frame, so the result has to be assigned back;
# calling it without assignment leaves df (and the exported CSV) with the old,
# gapped index.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Write the cleaned frame next to the input file, in the working directory

cleaned_csv_path = cwd + "/cleaned-BA-reviews.csv"
df.to_csv(cleaned_csv_path)