In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

In [3]:
import numpy as np

In [4]:
import string

In [5]:
import re

In [6]:
train_data = pd.read_csv("train_data.csv",names=["sentiments", "reviews"])

In [7]:
train_data.loc[0:5]

Unnamed: 0,sentiments,reviews
0,-1,"Eat at Fioris, they said. Youll like it, they..."
1,-1,I just don't understand the appeal. I've trie...
2,1,This is my go to place for a really good beef ...
3,-1,"Not impressed. When I ordered the Oyako bowl, ..."
4,-1,This is the first time ever I wrote a bad revi...
5,-1,I don't really mind dive places because there ...


In [8]:
def basic_cleaning(data_frame):
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.lower())
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.translate(str.maketrans('', '', string.punctuation)))
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.translate(str.maketrans('', '', string.digits)))
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: re.sub("r[^a-z]",'',review))

In [9]:
basic_cleaning(train_data)

In [10]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenize_data(data_frame):
    data_frame['words'] = data_frame.reviews.apply(lambda review: nltk.word_tokenize(review))

In [11]:
tokenize_data(train_data)

In [12]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def remove_stopwords(list):
    stop_words_removed = []
    for i in list:
        if i not in stopwords:
            stop_words_removed.append(i)
    return stop_words_removed


In [13]:
train_data['stop_words_cleaned'] = train_data.words.apply(lambda word_list: remove_stopwords(word_list))

In [14]:
# nltk.download('all')
def tag_pos(list_of_words):
    return nltk.pos_tag(list_of_words)

In [15]:
#extraction of lemma words after taggin with pos
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()
def find_lemma_word(word):
    lemma_words=[]
    words_with_pos = tag_pos(word)
    for word in words_with_pos:
        if word[1].startswith('NN'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='n'))
        elif word[1].startswith('VB'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='v'))
        elif word[1].startswith('JJ'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='a'))
        elif word[1].startswith('RB'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='r'))
        else:
            lemma_words.append(word[0])
            
    return lemma_words

In [16]:
train_data['lemma_word'] = train_data.stop_words_cleaned.apply(lambda word_list: find_lemma_word(word_list))

In [17]:
train_data['cleaned_review'] = train_data.lemma_word.apply(lambda review_list: " ".join(review_list))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
count_vectorizer = TfidfVectorizer(use_idf=False)

In [19]:
feature_matrix = count_vectorizer.fit_transform(train_data["cleaned_review"])

In [20]:
len(count_vectorizer.vocabulary_)*.2

18414.0

In [21]:
cv_ignore_bottom = TfidfVectorizer(max_features=int(len(count_vectorizer.vocabulary_) * 0.25))

In [22]:
#training test split
from sklearn.model_selection import train_test_split
reviews_train, reviews_test, sentiments_train, sentiments_test = train_test_split(train_data.cleaned_review,train_data.sentiments,shuffle=True, random_state=0,stratify=train_data.sentiments,train_size=.90)  


In [23]:
sentiments_train.value_counts()

 1    8100
-1    8100
Name: sentiments, dtype: int64

In [24]:
train_feature_data = cv_ignore_bottom.fit_transform(reviews_train)
train_matrix = train_feature_data.toarray()
test_feature_data = cv_ignore_bottom.transform(reviews_test)
test_matrix = test_feature_data.toarray()

In [25]:
df= pd.DataFrame(data=train_matrix, columns=cv_ignore_bottom.get_feature_names_out())

In [26]:
df.head()

Unnamed: 0,aa,aaa,aabc,aaron,ab,aback,abacus,abandon,abbreviation,abc,...,zone,zoo,zoolights,zoom,zoos,zorbas,zucchini,zucchinis,zuchinni,zumba
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.271801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
classified_sentiments = pd.DataFrame()
classified_sentiments['positive_sentiments'] = sentiments_train.apply(lambda x:  'yes' if x == 1 else  'no')
classified_sentiments['negative_sentiments'] = sentiments_train.apply(lambda x:  'yes' if x == -1 else  'no')
classified_sentiments.head()

Unnamed: 0,positive_sentiments,negative_sentiments
12540,yes,no
16176,yes,no
2885,no,yes
11271,yes,no
10876,yes,no


In [56]:
from sklearn.feature_selection import mutual_info_classif

In [57]:
# mutual = mutual_info_classif()
chi_weights_pos = mutual_info_classif(train_matrix, classified_sentiments['positive_sentiments'].to_numpy())
weight_pos = np.array(chi_weights_pos[0])
weight_pos

KeyboardInterrupt: 

In [54]:
chi_weights_neg = mutual_info_classif(train_matrix, classified_sentiments['negative_sentiments'].to_numpy())
weight_neg = np.array(chi_weights_neg[0])
weight_neg

array([0.01988847, 0.18342051, 0.67961616, ..., 0.04288036, 0.03923125,
       0.25100732])

In [50]:
for i in range(0, len(weight_neg)):
    print(weight_neg[i] == weight_pos[i])
    

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [30]:
def assign_new_weight(records):
    changed_weight = []
    for record in records:
        changed_weight.append(np.multiply(record,weights[0]))
    
    return np.array(changed_weight)

In [31]:
weighted_features_train = assign_new_weight(train_matrix)

In [32]:
weighted_features_test = assign_new_weight(test_matrix)

In [33]:
weighted_features_test.shape

(1800, 23017)

In [34]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn import metrics

# for i in range(40,47):
#     classifer = KNeighborsClassifier(n_neighbors=i,n_jobs=3,weights='distance')
#     classifer.fit(weighted_features_train, sentiments_train)
#     sentiments_predict = classifer.predict(weighted_features_test)
#     score = metrics.accuracy_score(sentiments_test,sentiments_predict)
#     print(score)
#     track_precision.append(score)

In [269]:
result_test_df = pd.read_csv('1661892619_9579706_test_file.csv', names=['reviews'])

In [270]:
basic_cleaning(result_test_df)

In [271]:
tokenize_data(result_test_df)

In [272]:
result_test_df.head()

Unnamed: 0,reviews,words
0,got takeout from here last night and it was ho...,"[got, takeout, from, here, last, night, and, i..."
1,girls are sweet and prices are reasonable the ...,"[girls, are, sweet, and, prices, are, reasonab..."
2,rudest people i have eveencountered husband a...,"[rudest, people, i, have, eveencountered, husb..."
3,this airport is only coveted fothe destination...,"[this, airport, is, only, coveted, fothe, dest..."
4,the last months have shown a steady decline i...,"[the, last, months, have, shown, a, steady, de..."


In [273]:
result_test_df['stop_words_cleaned'] = result_test_df.words.apply(lambda word_list: remove_stopwords(word_list))

In [274]:
result_test_df['lemma_word'] = result_test_df.stop_words_cleaned.apply(lambda word_list: find_lemma_word(word_list))

In [275]:
result_test_df['cleaned_review'] = result_test_df.lemma_word.apply(lambda review_list: " ".join(review_list))

In [276]:
result_test_feature = cv_ignore_bottom.transform(result_test_df['cleaned_review'])

In [277]:
result_feature_array = result_test_feature.toarray()

In [278]:
weighted_result_features = assign_new_weight(result_feature_array)

In [279]:
weighted_result_features.shape

(18000, 23017)

In [280]:
score_predict = classifer.predict(weighted_result_features)

In [284]:
import csv
outfile = open('results.csv','w')
out = csv.writer(outfile)
out.writerows(map(lambda x: [x], score_predict))
outfile.close()

In [285]:
score_predict.shape

(18000,)

In [325]:
weights

(array([0.01673611, 0.17705036, 0.67704909, ..., 0.16845732, 0.1633091 ,
        0.2296403 ]),
 array([0.89706638, 0.67392046, 0.41060472, ..., 0.68148649, 0.686128  ,
        0.63179067]))

In [35]:
sentiment_df = pd.DataFrame(sentiments_train, columns=['sentiments'])
df= pd.DataFrame(data=feature_matrix, columns=cv_ignore_bottom.get_feature_names_out())

In [36]:
rows = df.columns.values
df['sentiments'] = np.array(sentiments_train)

In [37]:
# sentiments_train.value_counts()
arr = []
for i in rows:
    arr.append(i)
arr

['aa',
 'aaa',
 'aabc',
 'aaron',
 'ab',
 'aback',
 'abacus',
 'abandon',
 'abbreviation',
 'abc',
 'abduct',
 'abe',
 'abercrombie',
 'abide',
 'ability',
 'abita',
 'able',
 'abnormal',
 'abnormally',
 'abomination',
 'abortion',
 'abound',
 'abouti',
 'aboutnni',
 'aboutnnthe',
 'aboveaverage',
 'abrahigm',
 'abrasive',
 'abroad',
 'abrubt',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absentminded',
 'absolut',
 'absolute',
 'absolutely',
 'absolutly',
 'absorb',
 'absurd',
 'absurdity',
 'absurdly',
 'abt',
 'abundance',
 'abundant',
 'abuse',
 'abusive',
 'abysmal',
 'ac',
 'academic',
 'acapulco',
 'acc',
 'accent',
 'accentuate',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepts',
 'access',
 'accessibility',
 'accessible',
 'accessory',
 'accident',
 'accidentally',
 'accidently',
 'acclaim',
 'acclimate',
 'accolade',
 'accolades',
 'accommodate',
 'accommodation',
 'accomodate',
 'accomodated',
 'accomodating',
 'accompanied',
 'accompaniment',
 'accompany'

In [42]:
p_table = pd.pivot_table(df, index= [arr[0]],columns=['sentiments'],aggfunc='size')

In [43]:
p_table.head()

sentiments,-1,1
aa,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,8098.0,8099.0
0.067181,1.0,
0.174858,1.0,
0.350606,,1.0
