In [2]:
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#CountVectorizer --> Bag of words
#TfidfVectorizer --> Tf-IDF
#gensim.models.Word2Vec --> word embeddings (word2vec)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Load the Data set

In [3]:
df = pd.read_csv('reviews.csv')
df.head()

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


### Data Cleaning

In [None]:
def clean_text(text):
    #substituite the irrelevant characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #convert text to lower case
    text = text.lower()
    #tokenise the sentence in form of words
    tokens = word_tokenize(text)
    #set the stopwords for english language
    stop_words = set(stopwords.words('english'))
    #remove the stopwords
    filtered_tokens = [word  for word in tokens  if word not in stop_words]
    #applyign word net lemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word)   for word in filtered_tokens]
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text

df['cleaned_reviews'] = df['Review'].apply(clean_text)

### Sentiment Mapping

In [6]:
sentiment_map = {1: "Negative", 2: "Negative", 3: "Neutral", 4: "Positive", 5: "Positive"}
df['sentiment'] = df['Label'].map(sentiment_map)
# save the dataframe to a new separate csv file
df.to_csv('preprocessed_data.csv') 
df.head()

Unnamed: 0,Id,Review,Label,cleaned_reviews,sentiment
0,0,good and interesting,5,good interesting,Positive
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently im still learning clas...,Positive
2,2,like!Prof and TAs are helpful and the discussi...,5,likeprof ta helpful discussion among student q...,Positive
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...,Positive
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacheri could got point eazliy v,Positive
