In [1]:
import re
import string
import os
import glob

import spacy
import nltk

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from spacy.lang.en import English
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
spacy.load('en')
#download train data from http://ai.stanford.edu/~amaas/data/sentiment/

  from numpy.core.umath_tests import inner1d


<spacy.lang.en.English at 0x7fa1f3ab4ba8>

# Functions for Preprocessing

In [2]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'may', 'also', 'across','among', 'beside', 'however', 'yet', 'within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

In [3]:
def preprocess_reviews(reviews):
    ls_preprocessed_review=[]
    content=list(filter(lambda a: a != '\n', reviews))
    for item in content:
        text=item.lower()
        text=removeStopWords(text)
        text, _=re.subn('\s+', ' ',text)
        text = text.translate(str.maketrans('','',string.punctuation))
        text = text.strip()
        ls_preprocessed_review.append(text)
    return ls_preprocessed_review

# Load Data Train and Test and Apply Preprocessing Functions

In [4]:
reviews_train = []
for line in open('./Data/SentimentData/full_train.txt', 'r'):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('./Data/SentimentData/full_test.txt', 'r'):
    reviews_test.append(line.strip())

In [5]:
reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

# TF_IDF Vectorization

In [6]:
#IDF
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 1), norm='l2', min_df=2,max_df=0.6)
vectorizer .fit(reviews_train_clean)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
#TF * IDF (x train and x test)
X = vectorizer.transform(reviews_train_clean)
X_test = vectorizer.transform(reviews_test_clean)

# Train LogisticRegression and Test the model

In [8]:
#Binarization of labels of Target
target = [1 if i < 12500 else 0 for i in range(25000)]
#split data into train and test (80% train and "20% test")
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.8)
#Accuracy for different "C"s in logarithm regression to find best number.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s"  % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.8218
Accuracy for C=0.05: 0.842
Accuracy for C=0.25: 0.869
Accuracy for C=0.5: 0.8784
Accuracy for C=1: 0.8876


In [9]:
final_model = LogisticRegression(C=1)
final_model.fit(X, target)
print ("Final Accuracy: %s" % accuracy_score(target, final_model.predict(X_test)))
# Final Accuracy: 0.88128

Final Accuracy: 0.88408


# Train RandomForestClassifier

In [10]:
clf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, random_state=42)
clf.fit(X, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [11]:
#by comparing random forest and LogisticRegression: LogisticRegressin is picked
print ("Final Accuracy: %s"  % accuracy_score(target, clf.predict(X_test)))

Final Accuracy: 0.86552


# Top words for each class (Negative and Positive)

In [12]:
feature_to_coef = {word: coef for word, coef in zip(vectorizer.get_feature_names(), final_model.coef_[0])}
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
print("========")   
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)


('great', 7.065434251864624)
('excellent', 6.21520540545708)
('best', 5.200642777036034)
('wonderful', 4.742939959011141)
('perfect', 4.668875024482069)
('worst', -9.156384652636044)
('bad', -7.589381675501876)
('waste', -6.648502562645551)
('awful', -6.404443767085823)
('boring', -5.549085868510076)


# Load data into Data Frame

In [13]:
rows=[]
allLine=[]
folder = "./Data/Chapters/"
filepaths = glob.glob(os.path.join(folder, '*.txt'))
for fp in filepaths:
    with open(fp, 'r') as f:
        lin = f.read()
        lines = lin.splitlines()
        sentimentall = final_model.predict_proba(vectorizer.transform(preprocess_reviews(lines)))
        neg = float(sum(d[0] for d in sentimentall)) / len(sentimentall)
        pos = float(sum(d[1] for d in sentimentall)) / len(sentimentall)
        rows.append( {'Chapter': re.search('(\w+).txt', f.name).group(1), 'Neg': neg , 'Pos':pos})
        
        for li in lines:
            sentimentli = final_model.predict_proba(vectorizer.transform(preprocess_reviews([li])))
            allLine.append( {'line': li,'Chapter': re.search('(\w+).txt', f.name).group(1), 'Neg': sentimentli[0][0] , 'Pos':sentimentli[0][1]})
dfChapter = pd.DataFrame(data=rows,dtype=float)
dflines = pd.DataFrame(data=allLine,dtype=float)

In [14]:
dflines.sort_values("Chapter",inplace=True)

In [15]:
dflines.reset_index(inplace=True)

# Apply Classifier on each Sentence

In [16]:
dflines["Positive"]=dflines["Pos"].apply(lambda x: x>0.5)

# Save Result for Sentences

In [17]:
dflines.to_csv("./Data/Senteces_Sentiment.csv",index=None)

# Decide Status (Positive or Negative) of each Chapter based on Sentences

In [18]:
chapters=dflines.groupby("Chapter")

In [19]:
list_chap=[]
for name,group in chapters:
    list_chap.append((name,str(group["Positive"].value_counts().ix[True]>group["Positive"].value_counts().ix[False])))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
Chapter_sentiment_df=pd.DataFrame(list_chap)

In [21]:
Chapter_sentiment_df.columns=["Chapter","posetive"]

In [22]:
Chapter_sentiment_df.to_csv("./Data/Chapters_Sentiment.csv",index=None)

In [23]:
Chapter_sentiment_df

Unnamed: 0,Chapter,posetive
0,chapter_0,True
1,chapter_1,True
2,chapter_10,False
3,chapter_11,True
4,chapter_12,False
5,chapter_13,True
6,chapter_14,True
7,chapter_15,False
8,chapter_16,False
9,chapter_17,True
