In [1498]:
import pandas as pd
import numpy as np
import string
import re

In [1499]:
train_data = pd.read_csv('1661892619_92027_train_file.csv',names=["sentiments", "reviews"])

### Text cleaning, tokenization and stopwords filtering

In [1500]:
def basic_cleaning(data_frame):
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.lower())
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.translate(str.maketrans('', '', string.punctuation)))
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.translate(str.maketrans('', '', string.digits)))
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: re.sub("r[^a-z]",'',review))

    
import nltk

def tokenize_data(data_frame):
    data_frame['words'] = data_frame.reviews.apply(lambda review: nltk.word_tokenize(review))

from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def remove_stopwords(list):
    stop_words_removed = []
    for i in list:
        if i not in stopwords:
            stop_words_removed.append(i)
    return stop_words_removed

### Tagging Part of Speech and extraction of root words from tokenized words

In [1501]:
# nltk.download('all') {Depending on the situation, might want to UNCOMMENT this to use nltk pos_tag}
#tag part of speech to get more accurate word during lemmatizaton
def tag_pos(list_of_words):
    return nltk.pos_tag(list_of_words)

#extraction of lemma words after pos taggin 
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()

def find_lemma_word(word):
    lemma_words=[]
    words_with_pos = tag_pos(word)
    for word in words_with_pos:
        if word[1].startswith('NN'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='n'))
        elif word[1].startswith('VB'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='v'))
        elif word[1].startswith('JJ'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='a'))
        elif word[1].startswith('RB'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='r'))
        else:
            lemma_words.append(word[0])
            
    return lemma_words

In [1502]:
basic_cleaning(train_data)
tokenize_data(train_data)
train_data['stop_words_cleaned'] = train_data.words.apply(lambda word_list: remove_stopwords(word_list))
train_data['lemma_word'] = train_data.stop_words_cleaned.apply(lambda word_list: find_lemma_word(word_list))
train_data['cleaned_review'] = train_data.lemma_word.apply(lambda review_list: " ".join(review_list))

In [1503]:
train_data

Unnamed: 0,sentiments,reviews,words,stop_words_cleaned,lemma_word,cleaned_review
0,-1,eat at fioris they said youll like it they sa...,"[eat, at, fioris, they, said, youll, like, it,...","[eat, fioris, said, youll, like, saidnnis, con...","[eat, fioris, say, youll, like, saidnnis, conv...",eat fioris say youll like saidnnis convenientl...
1,-1,i just dont understand the appeal ive tried t...,"[i, just, dont, understand, the, appeal, ive, ...","[dont, understand, appeal, ive, tried, place, ...","[dont, understand, appeal, ive, tried, place, ...",dont understand appeal ive tried place twice t...
2,1,this is my go to place foa really good beef en...,"[this, is, my, go, to, place, foa, really, goo...","[go, place, foa, really, good, beef, enchilada...","[go, place, foa, really, good, beef, enchilada...",go place foa really good beef enchilada red sa...
3,-1,not impressed when i ordered the oyako bowl th...,"[not, impressed, when, i, ordered, the, oyako,...","[impressed, ordered, oyako, bowl, conversation...","[impressed, order, oyako, bowl, conversation, ...",impressed order oyako bowl conversation go som...
4,-1,this is the first time evei wrote a bad review...,"[this, is, the, first, time, evei, wrote, a, b...","[first, time, evei, wrote, bad, review, frustr...","[first, time, evei, write, bad, review, frustr...",first time evei write bad review frustrate her...
...,...,...,...,...,...,...
17995,-1,i was referred to go to this place by a buddy ...,"[i, was, referred, to, go, to, this, place, by...","[referred, go, place, buddy, aftea, conversati...","[refer, go, place, buddy, aftea, conversation,...",refer go place buddy aftea conversation get sh...
17996,1,the food here was really good we started off ...,"[the, food, here, was, really, good, we, start...","[food, really, good, started, garlic, bread, c...","[food, really, good, start, garlic, bread, cov...",food really good start garlic bread cover toma...
17997,1,i eat at this place maybe a week i am die har...,"[i, eat, at, this, place, maybe, a, week, i, a...","[eat, place, maybe, week, die, hard, wing, fan...","[eat, place, maybe, week, die, hard, wing, fan...",eat place maybe week die hard wing fan best ev...
17998,1,phoenix airport is getting betteday by day i ...,"[phoenix, airport, is, getting, betteday, by, ...","[phoenix, airport, getting, betteday, day, pri...","[phoenix, airport, get, betteday, day, primari...",phoenix airport get betteday day primarily use...


### Train Test Split and volabulary extraction

In [1504]:
from sklearn.model_selection import train_test_split
train_review, test_review, train_sentiment, test_sentiment =train_test_split(train_data.cleaned_review,train_data.sentiments,shuffle=True,random_state=0,stratify=train_data.sentiments,train_size=.90)

In [1505]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(
    min_df=.0095,  # only include words occuring in .95% of the document
    use_idf=False, # We don't need inerse doc frequency as we are inerested in only vocabulary extraction
    ngram_range=(1,2)) # this will consider only one and two words (combined) as features

feature_matrix = tf_vectorizer.fit_transform(train_review)
feature_array = feature_matrix.toarray()

In [1506]:
vocab_list = tf_vectorizer.get_feature_names_out()
vocab_list

array(['able', 'absolutely', 'accommodate', 'across', 'act', 'actually',
       'add', 'admit', 'afte', 'aftea', 'aftei', 'afternoon', 'aftethe',
       'ago', 'agree', 'ahead', 'airport', 'allow', 'almost', 'alone',
       'along', 'already', 'also', 'although', 'always', 'amaze',
       'amazing', 'ambiance', 'american', 'amount', 'annoy', 'anothe',
       'anyone', 'anything', 'anyway', 'anywhere', 'apologize',
       'apparently', 'appointment', 'appreciate', 'area', 'arent',
       'arizona', 'around', 'arrive', 'ask', 'ask foa', 'ate',
       'atmosphere', 'attention', 'attentive', 'attitude', 'authentic',
       'available', 'average', 'avoid', 'away', 'awesome', 'awful', 'az',
       'baand', 'back', 'bacon', 'bad', 'bag', 'bar', 'barely', 'base',
       'basic', 'basically', 'bathroom', 'bbq', 'bean', 'beat',
       'beautiful', 'become', 'bed', 'beef', 'beer', 'begin', 'behind',
       'believe', 'best', 'bettethan', 'beyond', 'big', 'bill',
       'birthday', 'bit', 'bite', 

### Select K-best features

In [1507]:
from sklearn.feature_selection import SelectKBest, chi2

vocab_list = tf_vectorizer.get_feature_names_out() # extract vocabulary from the prevous step
select_k_best = SelectKBest(score_func=chi2, k= int(len(vocab_list)*.104)) # Only selecting top 10.4% of vocabulary based on the chi-square values
train_sentiment_np_array = np.array(train_sentiment)
select_k_best.fit(feature_array, train_sentiment_np_array)
mask = select_k_best.get_support()
k_best_feature = vocab_list[mask] # Select only K-best vocabulary as features for training
k_score = select_k_best.scores_
k_score = k_score[mask]
k_score

array([117.49553041,  72.7897542 ,  29.98804258,  56.44754701,
        74.36370902,  43.38379699,  16.55056932, 152.39033186,
        17.66473916,  92.98613464,  52.13520338,  41.84771814,
        24.46388616,  35.2801613 ,  25.49318514,  22.32829091,
        33.07515707,  99.77394068,  49.65302297,  34.58289896,
        17.93845493,  22.27007106,  23.60806556,  26.7673348 ,
        19.04477528,  24.42742745,  63.4631454 ,  20.14418664,
        43.48158212,  67.12754306,  17.01099046,  18.24795903,
        38.0899651 ,  92.78385777,  28.89877203,  27.24625673,
        37.52213123, 338.49366885,  25.0368445 ,  35.85577198,
        26.1011196 ,  24.15164985,  16.57736668,  26.48603526,
        33.07236701,  25.66369063,  37.90444654,  65.99557239,
        19.73994145,  17.31681769,  22.40461393,  43.53327206,
        22.20017458, 199.0904724 ,  39.46791812,  17.97674507,
        41.02583686,  69.46757439,  23.09513494,  42.86334196,
        55.14142096,  26.32131335,  19.40438743,  36.01

In [1508]:
k_best_feature.shape

(99,)

### Supervised Chi-Square weight for k-best term by utilising target value

In [1509]:
# Build Tf-Idf table with K-best features selected in the previous step
from sklearn.feature_extraction.text import TfidfVectorizer
selected_tf_vectorizer = TfidfVectorizer(use_idf=True, vocabulary=k_best_feature, sublinear_tf=True, ngram_range=(1,2))
selected_feat_array = selected_tf_vectorizer.fit_transform(train_review).toarray()
selected_test_array = selected_tf_vectorizer.transform(test_review).toarray()

In [1510]:
# To extract Chi-sqare value we need observed and expected frequency table
observed_value_table = pd.DataFrame(selected_tf_vectorizer.get_feature_names_out(), columns=['features'])

In [1511]:
# Loading Tf-Idf and sentiments to Data Frame for calculating chi-sqare value
tf_data_frame = pd.DataFrame(selected_feat_array,columns = selected_tf_vectorizer.get_feature_names_out())
tf_data_frame['sentiments'] = np.array(train_sentiment)

In [1512]:
# Calculates the sum of features presence according to its presence in positive and negative reviews
observed_value_table['positive_sentiment'] = observed_value_table.features.apply(lambda feature: tf_data_frame.loc[tf_data_frame['sentiments']==1,feature].sum())
observed_value_table['negative_sentiment'] = observed_value_table.features.apply(lambda feature: tf_data_frame.loc[tf_data_frame['sentiments']==-1,feature].sum())
observed_value_table['total_row_count'] = observed_value_table['positive_sentiment'] + observed_value_table['negative_sentiment']
observed_value_table

Unnamed: 0,features,positive_sentiment,negative_sentiment,total_row_count
0,always,603.189705,224.982624,828.172328
1,amaze,266.453306,37.288465,303.741771
2,amazing,136.244773,24.295907,160.540680
3,ask,180.898781,441.135587,622.034369
4,awesome,310.024743,61.789369,371.814112
...,...,...,...,...
94,wonderful,172.932561,34.279144,207.211704
95,wont,86.824651,219.829920,306.654571
96,would,382.290551,665.825364,1048.115914
97,wouldnt,50.765720,150.047241,200.812961


In [1513]:
# Calculated expected value table
chi_sqare_expected = pd.DataFrame()

def chi_sqare_value(row_totals, positive_column_total, negative_column_total, total_sum):
    positive_chi_value = []
    negative_chi_value = []
    for i in range(0, len(row_totals)):
        positive_chi_value.append((row_totals[i]*positive_column_total)/total_sum)
        negative_chi_value.append((row_totals[i]*negative_column_total)/total_sum)
    return positive_chi_value, negative_chi_value

In [1514]:
row_totals = np.array(observed_value_table['total_row_count'])
positive_column_total = observed_value_table['positive_sentiment'].sum()
negative_column_total = observed_value_table['negative_sentiment'].sum()
total_sum = observed_value_table['total_row_count'].sum()
expected_positive, expected_negative = chi_sqare_value(row_totals, positive_column_total, negative_column_total, total_sum)

In [1515]:
chi_sqare_expected['+1'] = np.array(expected_positive)
chi_sqare_expected['-1'] = np.array(expected_negative)

In [1516]:
# This will be our FINAL Chi-Square table derived from observed and expected Frequency
chi_sqare_table = pd.DataFrame()

In [1517]:
chi_sqare_table['positiev_sentiments'] = ((observed_value_table['positive_sentiment'] - chi_sqare_expected['+1'])**2)/chi_sqare_expected['+1']
chi_sqare_table['negatiev_sentiments'] = ((observed_value_table['negative_sentiment'] - chi_sqare_expected['-1'])**2)/chi_sqare_expected['-1']
chi_sqare_table['sum'] = chi_sqare_table['positiev_sentiments'] + chi_sqare_table['negatiev_sentiments']

In [1518]:
chi_sqare_table

Unnamed: 0,positiev_sentiments,negatiev_sentiments,sum
0,102.827931,96.203665,199.031596
1,97.489390,91.209038,188.698429
2,44.322325,41.467042,85.789366
3,47.707891,44.634507,92.342397
4,94.477328,88.391015,182.868343
...,...,...,...
94,52.878628,49.472140,102.350768
95,25.433782,23.795315,49.229097
96,30.509930,28.544454,59.054384
97,22.084196,20.661512,42.745708


### Create weighted features by utilising respective chi-square weight of features

In [1519]:
def generate_chi_square_weighted_features(feat_to_be_weighted, chi_weight):
    new_weighted_feature= []
    numpy_feat_array = np.array(feat_to_be_weighted)
    for i in range(0,len(numpy_feat_array)):
            new_weighted_feature.append(np.multiply(numpy_feat_array[i], chi_weight))
    return new_weighted_feature

In [1520]:
chi_weight = np.array (chi_sqare_table['sum'])
weighted_selected_feat = np.array(generate_chi_square_weighted_features(selected_feat_array, chi_weight))
weighted_test_feat = np.array(generate_chi_square_weighted_features( selected_test_array, chi_weight))

## K-NN from scratch

In [1521]:
# Distance of a test data from training data
from sklearn.metrics.pairwise import euclidean_distances
def calculate_neighbour(train_data, test_data):
    
    numpy_distances = np.array(euclidean_distances(train_data,[test_data]).flatten())
    inv_distance = []

    # Divided by zero exception handeled by adding 1.0000000000000004e+63 (This is weight of a test data point which is exactly same as train point)
    for i in numpy_distances:
        inv_distance.append(np.divide(1,max(i**3,.000000000000000000001**3))) # taking inverse of distance because nearest neighbours will have higher contribution in prediction

    inv_distance = np.array(inv_distance)
    indexes_by_shortest_dist = inv_distance.argsort() # This returns the indexes after shoring the distances
    # Reverse it as Higher Weight corresponds to nearest neighbour
    return  np.flip(indexes_by_shortest_dist)

In [1522]:
def find_majority_and_predict(train_data, test_data, train_sentiments, K):
    nearest_neighbors_indexes = calculate_neighbour(train_data, test_data)
    sentiment_classes = []
    # take sentiments of K-best train data points which best represents the class of test data point
    for i in range(0,K):
        sentiment_classes.append(train_sentiments[nearest_neighbors_indexes[i]])
    return max(sentiment_classes, key=sentiment_classes.count)

In [None]:
# Predict sentiment
train_sentiments_array = np.array(train_sentiment)
test_sentiments_array = np.array(test_sentiment)
predicted_sentiments = []
print(len(weighted_test_feat))
for i in range(0,len(weighted_test_feat)):
    # print(i)
    predicted_sentiments.append(find_majority_and_predict(weighted_selected_feat,weighted_test_feat[i], train_sentiments_array, 23))
    

1800


In [None]:
# Calulate accuracy of prediction by cross validation
def find_accuracy(predicted_sentiment, real_sentiment):
    correct_prediction = 0
    for i in range(0,len(predicted_sentiments)):
        if predicted_sentiment[i] == real_sentiment[i]:
            correct_prediction += 1
    return np.divide(correct_prediction, len(predicted_sentiment))

In [None]:
print(find_accuracy(predicted_sentiments,test_sentiments_array))

### Prepare Unlabeled Test Data

In [None]:
result_test_df = pd.read_csv('1661892619_9579706_test_file.csv', names=['reviews'])
basic_cleaning(result_test_df)
tokenize_data(result_test_df)
result_test_df['stop_words_cleaned'] = result_test_df.words.apply(lambda word_list: remove_stopwords(word_list))
result_test_df['lemma_word'] = result_test_df.stop_words_cleaned.apply(lambda word_list: find_lemma_word(word_list))
result_test_df['cleaned_review'] = result_test_df.lemma_word.apply(lambda review_list: " ".join(review_list))
unlabeled_test_feat_matrix = selected_tf_vectorizer.transform(result_test_df['cleaned_review']).toarray()
unlabeled_test_feat_matrix.shape

In [None]:
unlabled_weighted_data = generate_chi_square_weighted_features(unlabeled_test_feat_matrix, chi_weight)

In [None]:
predicted_sentiments = []
print(len(unlabled_weighted_data))
for i in range(0,len(unlabled_weighted_data)):
    # print(i)
    predicted_sentiments.append(find_majority_and_predict(weighted_selected_feat,unlabled_weighted_data[i], train_sentiments_array, 23))


In [None]:
len(predicted_sentiments)

In [None]:
import csv
outfile = open('./results.csv','w')
out = csv.writer(outfile)
out.writerows(map(lambda x: [x], predicted_sentiments))
outfile.close()