In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import re
import pickle 

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import heapq

# nltk.download('stopwords')

In [2]:
#### LOADING THE DATASET ####

# from sklearn.datasets import load_files
# reviews_train = load_files('data/train/')
# reviews_test = load_files('data/test/')
# X_train,y_train = reviews_train.data,reviews_train.target
# X_test,y_test = reviews_test.data,reviews_test.target


#### SAVING THE DATASET ####

# with open('X_train.pickle','wb') as f:
#     pickle.dump(X_train,f)

# with open('X_test.pickle','wb') as f:
#     pickle.dump(X_test,f)  
    
# with open('y_train.pickle','wb') as f:
#     pickle.dump(y_train,f)

# with open('y_test.pickle','wb') as f:
#     pickle.dump(y_test,f)


#### LOADING THE DATASET ####

with open('X_train.pickle','rb') as f:
    X_train = pickle.load(f)
    
with open('X_test.pickle','rb') as f:
    X_test = pickle.load(f)
    
with open('y_train.pickle','rb') as f:
    y_train = pickle.load(f)
    
with open('y_test.pickle','rb') as f:
    y_test = pickle.load(f)
    

X = X_train + X_test

y = np.concatenate([y_train,y_test])
X = X[:50000]
y = y[:50000]

In [3]:
def remove_special_symbols(sample):
    review = re.sub(r'\W', ' ', sample)
    review = re.sub(r'\d', ' ', review)
    review = review.lower()
    review = re.sub(r'br[\s$]', ' ', review)
    review = re.sub(r'\s+[a-z][\s$]', ' ',review)
    review = re.sub(r'b\s+', '', review)
    review = re.sub(r'\s+', ' ', review)
    return review
    

def process_words(words, corpus):
    new_words = []
    temp_word = ''
    
    for word in words:
        antonyms = []
        if temp_word in uncheck_words:
            if word not in stop_words:
                word = 'not_' + word
                temp_word = ''
        elif temp_word == 'not_':
            for syn in wordnet.synsets(word):
                for s in syn.lemmas():
                    for a in s.antonyms():
                        antonyms.append(a.name())
            if len(antonyms) >= 1:
                word = antonyms[0]
            else:
                word = temp_word + word
            temp_word = ''
                
        if word in uncheck_words:
            temp_word = word  
        elif word == 'not':
            temp_word = 'not_'
        
        if word != 'not' and word not in uncheck_words:
            new_words.append(word)
            
    review = ' '.join(new_words)
    corpus.append(review)
    return corpus


def process_data(sample, uncheck_words, stop_words):
    corpus = []
    for i in range(0, len(sample)):
        review = remove_special_symbols(str(sample[i]))
        words = review.split(' ')
        corpus = process_words(words, corpus)
    return corpus

In [4]:
stop_words = stopwords.words('english')
uncheck_words = ['don','won','doesn','couldn','isn','wasn','wouldn','can','ain','shouldn','not','havn','hadn','hasn','aren']

corpus = []
corpus = process_data(X, uncheck_words, stop_words)

In [5]:
# len(corpus)

In [6]:
def count_words(corpus, stop_words):
    word2count = {}
    for data in corpus:
        words = nltk.word_tokenize(data)
        for word in words:
            if word not in stop_words:
                if word not in word2count.keys():
                    word2count[word] = 1
                else:
                    word2count[word] += 1
    return word2count
     



def find_idf(corpus, word2count, freq_words):
    word_idfs = {}
    for word in freq_words:
        doc_count = 0
        for data in corpus:
            if word in nltk.word_tokenize(data):
                doc_count += 1
        word_idfs[word] = np.log(len(corpus)/(1+doc_count))
    return word_idfs

def find_tf(corpus, freq_words):
    tf_matrix = {}
    for word in freq_words:
        doc_tf = []
        for data in corpus:
            frequency = 0
            for w in nltk.word_tokenize(data):
                if word == w:
                    frequency += 1
            tf_word = frequency/len(nltk.word_tokenize(data))
            doc_tf.append(tf_word)
        tf_matrix[word] = doc_tf
    return tf_matrix


    

def find_tfidf(corpus, num, stop_words):
    tfidf_matrix = []
    word2count = count_words(corpus, stop_words)
    freq_words = heapq.nlargest(num,word2count,key=word2count.get)
    print(freq_words)
    tf_matrix = find_tf(corpus, freq_words)
    word_idfs = find_idf(corpus, word2count, freq_words)
    for word in tf_matrix.keys():
        tfidf = []
        for value in tf_matrix[word]:
            score = value * word_idfs[word]
            tfidf.append(score)
        tfidf_matrix.append(tfidf)   
    X = np.asarray(tfidf_matrix)
    X = np.transpose(X)
    return X

In [57]:
X = find_tfidf(corpus[:1000], 200, stop_words)

with open('TFIDF_scratch.pickle','wb') as f:
    pickle.dump(X,f) 

['movie', 'film', 'one', 'like', 'good', 'time', 'would', 'story', 'really', 'even', 'see', 'well', 'also', 'first', 'great', 'much', 'made', 'people', 'way', 'bad', 'could', 'movies', 'make', 'characters', 'love', 'get', 'two', 'life', 'many', 'show', 'character', 'films', 'plot', 'seen', 'never', 'watch', 'acting', 'ever', 'man', 'little', 'best', 'think', 'better', 'still', 'work', 'end', 'back', 'scene', 'watching', 'director', 'though', 'something', 'xc', 'scenes', 'old', 'go', 'makes', 'know', 'another', 'actually', 'thing', 'actors', 'real', 'young', 'world', 'cast', 'say', 'lot', 'years', 'got', 'new', 'may', 'every', 'seems', 'us', 'things', 'around', 'find', 'part', 'fact', 'nothing', 'original', 'however', 'times', 'look', 'must', 'funny', 'music', 'give', 'pretty', 'going', 'take', 'quite', 'whole', 'family', 'almost', 'action', 'long', 'script', 'girl', 'far', 'interesting', 'gets', 'always', 'without', 'comedy', 'want', 'enough', 'away', 'right', 'least', 'woman', 'come',

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
tiv = TfidfVectorizer(max_features = 2000, min_df = 2, norm="l2", use_idf=True, sublinear_tf = True, max_df = 0.6, stop_words = stop_words)
X = tiv.fit_transform(corpus).toarray()

with open('TFIDF.pickle','wb') as f:
    pickle.dump(X,f) 

In [8]:
with open('TFIDF.pickle','rb') as f:
    X = pickle.load(f)

In [9]:
# with open('TFIDF_scratch.pickle','rb') as f:
#     X = pickle.load(f)

In [10]:
from sklearn.model_selection import train_test_split
text_train, text_test, sent_train, sent_test = train_test_split(X, y[:1000], test_size = 0.2, random_state = 0)

In [11]:
from sklearn.svm import LinearSVC
classifier = LinearSVC(C = 0.1)
classifier.fit(text_train,sent_train)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [12]:
with open('svcclassifier.pickle','wb') as f:
    pickle.dump(classifier,f)
    
with open('svcclassifier.pickle','rb') as f:
    classifier = pickle.load(f)

In [13]:
sent_pred = classifier.predict(text_test)

In [14]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(sent_test, sent_pred)
cm

array([[70, 36],
       [29, 65]])

In [15]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = text_train, y = sent_train, cv = 10)
print(accuracies)
print(accuracies.mean())
print(accuracies.std())

[0.72839506 0.72839506 0.7125     0.7875     0.7        0.7625
 0.775      0.7625     0.74683544 0.79746835]
0.7501093920925144
0.030735461835207114


In [None]:
with open('TFIDF.pickle','rb') as f:
    tfidf = pickle.load(f)
    


In [20]:
X.shape

(1000, 200)