In [211]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
import re
import os

In [212]:
train_data = pd.read_csv('1661892619_92027_train_file.csv',names=["sentiments", "reviews"])

In [213]:
def basic_cleaning(data_frame):
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.lower())
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.translate(str.maketrans('', '', string.punctuation)))
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: review.translate(str.maketrans('', '', string.digits)))
    data_frame['reviews'] = data_frame.reviews.apply(lambda review: re.sub("r[^a-z]",'',review))

    
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenize_data(data_frame):
    data_frame['words'] = data_frame.reviews.apply(lambda review: nltk.word_tokenize(review))

from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def remove_stopwords(list):
    stop_words_removed = []
    for i in list:
        if i not in stopwords:
            stop_words_removed.append(i)
    return stop_words_removed

In [214]:
# nltk.download('all')
#tag part of speech to get more accurate word during lemmatizaton
def tag_pos(list_of_words):
    return nltk.pos_tag(list_of_words)

#extraction of lemma words after pos taggin 
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()

def find_lemma_word(word):
    lemma_words=[]
    words_with_pos = tag_pos(word)
    for word in words_with_pos:
        if word[1].startswith('NN'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='n'))
        elif word[1].startswith('VB'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='v'))
        elif word[1].startswith('JJ'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='a'))
        elif word[1].startswith('RB'):
            lemma_words.append(lemmatizer.lemmatize(word[0],pos='r'))
        else:
            lemma_words.append(word[0])
            
    return lemma_words

In [215]:
basic_cleaning(train_data)
tokenize_data(train_data)
train_data['stop_words_cleaned'] = train_data.words.apply(lambda word_list: remove_stopwords(word_list))
train_data['lemma_word'] = train_data.stop_words_cleaned.apply(lambda word_list: find_lemma_word(word_list))
train_data['cleaned_review'] = train_data.lemma_word.apply(lambda review_list: " ".join(review_list))

In [216]:
train_data.head()

Unnamed: 0,sentiments,reviews,words,stop_words_cleaned,lemma_word,cleaned_review
0,-1,eat at fioris they said youll like it they sa...,"[eat, at, fioris, they, said, youll, like, it,...","[eat, fioris, said, youll, like, saidnnis, con...","[eat, fioris, say, youll, like, saidnnis, conv...",eat fioris say youll like saidnnis convenientl...
1,-1,i just dont understand the appeal ive tried t...,"[i, just, dont, understand, the, appeal, ive, ...","[dont, understand, appeal, ive, tried, place, ...","[dont, understand, appeal, ive, tried, place, ...",dont understand appeal ive tried place twice t...
2,1,this is my go to place foa really good beef en...,"[this, is, my, go, to, place, foa, really, goo...","[go, place, foa, really, good, beef, enchilada...","[go, place, foa, really, good, beef, enchilada...",go place foa really good beef enchilada red sa...
3,-1,not impressed when i ordered the oyako bowl th...,"[not, impressed, when, i, ordered, the, oyako,...","[impressed, ordered, oyako, bowl, conversation...","[impressed, order, oyako, bowl, conversation, ...",impressed order oyako bowl conversation go som...
4,-1,this is the first time evei wrote a bad review...,"[this, is, the, first, time, evei, wrote, a, b...","[first, time, evei, wrote, bad, review, frustr...","[first, time, evei, write, bad, review, frustr...",first time evei write bad review frustrate her...


### Feature subset Selection

In [217]:
from sklearn.model_selection import train_test_split
train_review, test_review, train_sentiment, test_sentiment =train_test_split(train_data.cleaned_review,train_data.sentiments,
                                                                             shuffle=True, ## shuffel to avoide sequential classes
                                                                             random_state=0,
                                                                             stratify=train_data.sentiments, ## create a balanced sample based on the target variable
                                                                             train_size=.85) ## size of our training split testing split will be 15%

In [218]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(min_df=.0095, ## term must be present in atleast 0.95% of the document
                                use_idf=False, ## creating a dictionary for now
                                ngram_range=(1,2)) ## adding uni and bigram
feature_matrix = tf_vectorizer.fit_transform(train_review)
feature_array = feature_matrix.toarray()

In [219]:
vocab_list = tf_vectorizer.get_feature_names_out()
vocab_list

array(['able', 'absolutely', 'accommodate', 'across', 'act', 'actual',
       'actually', 'add', 'admit', 'afte', 'aftea', 'aftei', 'afternoon',
       'aftethe', 'ago', 'agree', 'ahead', 'airport', 'allow', 'almost',
       'alone', 'along', 'already', 'also', 'although', 'always', 'amaze',
       'amazing', 'ambiance', 'american', 'amount', 'anothe', 'anyone',
       'anything', 'anyway', 'anywhere', 'apologize', 'apparently',
       'appointment', 'appreciate', 'area', 'arent', 'arizona', 'around',
       'arrive', 'ask', 'ask foa', 'ate', 'atmosphere', 'attention',
       'attentive', 'attitude', 'authentic', 'available', 'average',
       'avoid', 'away', 'awesome', 'awful', 'az', 'baand', 'back',
       'bacon', 'bad', 'bag', 'bar', 'barely', 'base', 'basic',
       'basically', 'bathroom', 'bbq', 'bean', 'beat', 'beautiful',
       'become', 'bed', 'beef', 'beer', 'begin', 'behind', 'believe',
       'best', 'bettethan', 'beyond', 'big', 'bill', 'birthday', 'bit',
       'bite',

### Select K-best features

In [220]:
from sklearn.feature_selection import SelectKBest, chi2

vocab_list = tf_vectorizer.get_feature_names_out()
select_k_best = SelectKBest(score_func=chi2, k= int(len(vocab_list)*.104))
train_sentiment_np_array = np.array(train_sentiment)
select_k_best.fit(feature_array, train_sentiment_np_array)
mask = select_k_best.get_support()
k_best_feature = vocab_list[mask]

In [221]:
k_best_feature.shape

(99,)

In [222]:
from sklearn.feature_extraction.text import TfidfVectorizer
selected_tf_vectorizer = TfidfVectorizer(use_idf=True, vocabulary=k_best_feature, sublinear_tf=True, ngram_range=(1,2))
selected_feat_array = selected_tf_vectorizer.fit_transform(train_review).toarray()
selected_test_array = selected_tf_vectorizer.transform(test_review).toarray()
observed_value_table = pd.DataFrame(selected_tf_vectorizer.get_feature_names_out(), columns=['features'])

In [223]:
selected_tf_vectorizer.get_feature_names_out()

array(['always', 'amaze', 'amazing', 'ask', 'avoid', 'awesome', 'awful',
       'az', 'bad', 'beautiful', 'best', 'bland', 'call', 'charge',
       'charlotte', 'cold', 'could', 'definitely', 'delicious', 'didnt',
       'dirty', 'disappointed', 'disgust', 'dont', 'dry', 'easy', 'enjoy',
       'excellent', 'family', 'fantastic', 'favorite', 'food great',
       'fresh', 'friendly', 'fun', 'give', 'good', 'great', 'great food',
       'great place', 'great service', 'gross', 'happy', 'helpful',
       'highly', 'highly recommend', 'horrible', 'hotel', 'knowledgeable',
       'lack', 'leave', 'little', 'love', 'love place', 'maybe',
       'mediocre', 'mexican', 'minute', 'money', 'nice', 'nothing', 'ok',
       'okay', 'one best', 'order', 'overprice', 'pay', 'perfect',
       'phoenix', 'pittsburgh', 'recommend', 'rude', 'say', 'seem',
       'selection', 'slow', 'sorry', 'spot', 'staff', 'table', 'taste',
       'taste like', 'tell', 'terrible', 'think', 'two', 'unfortunately',
     

In [224]:
test_sentiments_array = np.array(test_sentiment)
train_sentiments_array = np.array(train_sentiment)
# chi_sqare_table['max_chi_value'] = find_max_chi_val(chi_sqare_table['positiev_sentiments'], chi_sqare_table['negatiev_sentiments'])

In [225]:
from sklearn.metrics.pairwise import euclidean_distances
def calculate_neighbour(train_data, test_data):
    
    numpy_distances = np.array(euclidean_distances(train_data,[test_data]).flatten())
    # storing inverse of distances
    inv_distance = []
    for i in numpy_distances:
        # Divided exception avoided  by using a very small value
        inv_distance.append(np.divide(1,max(i**3,.000000000000000000001**3)))

    inv_distance = np.array(inv_distance)
    indexes_by_shortest_dist = inv_distance.argsort() ## returns indexes from smallest to largest

    return  np.flip(indexes_by_shortest_dist)

In [202]:
def find_majority_and_predict(train_data, test_data, train_sentiments, K):
    nearest_neighbors_indexes = calculate_neighbour(train_data, test_data)
    sentiment_classes = []
    for i in range(0,K):
        sentiment_classes.append(train_sentiments[nearest_neighbors_indexes[i]])
        
    return max(sentiment_classes, key=sentiment_classes.count)

In [203]:
def find_accuracy(predicted_sentiment, real_sentiment):
    correct_prediction = 0
    for i in range(0,len(predicted_sentiments)):
        if predicted_sentiment[i] == real_sentiment[i]:
            correct_prediction += 1
    return np.divide(correct_prediction, len(predicted_sentiment))

In [204]:
print(len(selected_test_array))
predicted_sentiments = []
for i in range(0,len(selected_test_array)):
    # print(i)
    predicted_sentiments.append(find_majority_and_predict(selected_feat_array,selected_test_array[i], train_sentiments_array,22))
score =  find_accuracy(predicted_sentiments, test_sentiments_array)

2700


In [205]:
score

0.8318518518518518

In [206]:
result_test_df = pd.read_csv('1661892619_9579706_test_file.csv', names=['reviews'])
basic_cleaning(result_test_df)
tokenize_data(result_test_df)
result_test_df['stop_words_cleaned'] = result_test_df.words.apply(lambda word_list: remove_stopwords(word_list))
result_test_df['lemma_word'] = result_test_df.stop_words_cleaned.apply(lambda word_list: find_lemma_word(word_list))
result_test_df['cleaned_review'] = result_test_df.lemma_word.apply(lambda review_list: " ".join(review_list))
unlabeled_test_feat_matrix = selected_tf_vectorizer.transform(result_test_df['cleaned_review']).toarray()

In [207]:
unlabeled_test_feat_matrix.shape

(18000, 99)

In [208]:
predicted_sentiments = []
print(len(unlabeled_test_feat_matrix))
for i in range(0,len(unlabeled_test_feat_matrix)):
    # print(i)
    predicted_sentiments.append(find_majority_and_predict(selected_feat_array,unlabeled_test_feat_matrix[i], train_sentiments_array, 22))


18000


In [209]:
len(predicted_sentiments)

18000

In [210]:
import csv
outfile = open('./results.csv','w')
out = csv.writer(outfile)
out.writerows(map(lambda x: [x], predicted_sentiments))
outfile.close()