In [27]:
import pandas as pd
import re
from textblob import TextBlob

In [4]:
df = pd.read_csv("./datasets/labeledTrainData.tsv", sep="\t")

In [5]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [20]:
def clean_punctuation(text):
    return re.match("[a-zA-Z .]*", text)

In [21]:
clean_punctuation("he film starts with a manager (Nicholas Bell)")[0]

'he film starts with a manager '

In [24]:
num_positive = df[df["sentiment"] == 1].shape[0]

In [23]:
num_negetive = df[df["sentiment"] == 0].shape[0]

In [25]:
num_positive, num_negetive

(12500, 12500)

In [26]:
df.shape

(25000, 3)

In [36]:
def perform_sentiment_analysis(text):
    testimonial = TextBlob(text)
    return 0 if testimonial.sentiment.polarity < 0 else 1

In [37]:
df["textblob_sentiment"] = df["review"].apply(perform_sentiment_analysis)

In [38]:
df.head()

Unnamed: 0,id,sentiment,review,textblob_sentiment
0,5814_8,1,With all this stuff going down at the moment w...,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,0,It must be assumed that those who praised this...,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0


In [39]:
from sklearn.metrics import accuracy_score

In [41]:
accuracy_score(df["sentiment"],df["textblob_sentiment"])

0.68524

In [48]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

In [54]:

def perform_sentiment_analysis_no_stowords(text):
    testimonial = TextBlob(text.lower())
    textblb = TextBlob(" ".join([word for word in testimonial.words if word not in stop_words]))
    return 0 if testimonial.sentiment.polarity < 0 else 1

In [56]:
df["textblob_sentiment_without_stopword"] = df["review"].apply(perform_sentiment_analysis_no_stowords)

In [57]:
df.head()

Unnamed: 0,id,sentiment,review,textblob_sentiment,textblob_sentiment_without_stopword
0,5814_8,1,With all this stuff going down at the moment w...,1,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",1,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0,0
3,3630_4,0,It must be assumed that those who praised this...,1,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0,0


In [58]:
accuracy_score(df["sentiment"],df["textblob_sentiment_without_stopword"])

0.68552