In [1]:
import codecs
from os import listdir
import pickle
import re
import copy
from collections import Counter
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

In [14]:
#class variables
#List of words
# neg_token = list()
# pos_token = list()
neg_vocab = Counter() #counter, so that frequency will be added not replaced
pos_vocab = Counter()
# this dict will store words without any common words.
neg_dict = dict()
pos_dict = dict()



In [3]:
#Read Data
def load_data(filename):
    #open text file
    with codecs.open(filename, 'r', encoding = 'utf-8') as file:
        text = file.read()
    return text   
    

In [4]:
# Data cleaning
def clean_data(text):
    word_pattern = re.compile("^\w+$")
    esw = stopwords.words('english')
    esw.append("would")
    
    #tokenize, stemming, converting to lower...
    tokens = WordPunctTokenizer().tokenize(PorterStemmer().stem(text))
    tokens = list(map(lambda x: x.lower(), tokens))
    tokens = [token for token in tokens if re.match(word_pattern, token) and token not in esw]
    tokens = [word for word in tokens if len(word) > 1]
#     print(type(tokens))
    return tokens

In [5]:
#Generate vocabulary
def gen_vocab(filename, vocab):
    text = load_data(filename)
    tokens = clean_data(text)
    #update tokens with their frequency as a dict but doesnt replace old value like dict
    vocab.update(tokens) #Counter-helps to add frequency for same word unlike dict-replace the old value with new
    if vocab == neg_vocab:
        neg_token.append(tokens)
        
    elif vocab == pos_vocab:
        pos_token.append(tokens)
        

In [6]:
#Run all the text documents of the directory and call func to update vocabulary
def run_all_docs(directory, vocab):
    #run all the text file here through loop
    for filename in listdir(directory):
        #leave the file it's not a text file
        if not filename.endswith(".txt"):
            continue
        #make complete path    
        path = directory + '/' + filename
        gen_vocab(path, vocab)

In [30]:
#after making vocab with counter(freq), it will remove common word
def remove_common_words(dict_to_clear, dict_to_refer):
    u_word = dict()
    for k,v in dict_to_clear.items():
        if (k not in dict_to_refer) or ((k in dict_to_refer) and (v > dict_to_refer[k])):
            u_word[k] = v
    return u_word

In [31]:
def build_pkl(vocab):
    if vocab == neg_vocab:
        #save Vocab with their freuency in dataframe as pkl
        neg_df_no_comm = pd.DataFrame.from_dict(neg_dict, orient = 'index').reset_index()
        neg_df_no_comm.columns = ['word', 'count']
        neg_df_no_comm['type'] = 0
        neg_df_no_comm.to_pickle("C:/Users/family/Desktop/Sentiment_NLP/Dictionary/neg_df_no_comm.pkl")
        
    elif vocab == pos_vocab:
        #save Vocab with their freuency in dataframe as pkl
        pos_df_no_comm = pd.DataFrame.from_dict(pos_dict, orient = 'index').reset_index()
        pos_df_no_comm.columns = ['word', 'count']
        pos_df_no_comm['type'] = 1
        pos_df_no_comm.to_pickle("C:/Users/family/Desktop/Sentiment_NLP/Dictionary/pos_df_no_comm.pkl")
        

In [7]:
# #To build a dataframe of vocabulary with column "type" where o = negative and 1 =positive
# def build_pkl(vocab):
    
#     if vocab == neg_vocab:
#         #save Vocab with their freuency in dataframe as pkl
#         neg_df = pd.DataFrame.from_dict(neg_vocab, orient = 'index').reset_index()
#         neg_df.columns = ['word', 'count']
#         neg_df['type'] = 0
#         neg_df.to_pickle("C:/Users/family/Desktop/Sentiment_NLP/neg_df.pkl")
               
#         #pickle the list of tokens
#         f = open("C:/Users/family/Desktop/Sentiment_NLP/neg_token.pkl", 'wb')
#         pickle.dump(neg_token, f)
#         f.close()
        
#     elif vocab == pos_vocab:
#         #save Vocab with their freuency in dataframe as pkl
#         pos_df = pd.DataFrame.from_dict(pos_vocab, orient = 'index').reset_index()
#         pos_df.columns = ['word', 'count']
#         pos_df['type'] = 1
#         pos_df.to_pickle("C:/Users/family/Desktop/Sentiment_NLP/pos_df.pkl")
                        
#         #pickle the list of tokens
#         f = open("C:/Users/family/Desktop/Sentiment_NLP/pos_token.pkl", 'wb')
#         pickle.dump(pos_token, f)
#         f.close()        
        
    

In [32]:
def sentiment_vocab(path,type):
    if type == "neg":
        vocab = neg_vocab
        run_all_docs(path, vocab)         
        neg_dict = (remove_common_words(neg_vocab, pos_vocab))#remove common words and make a dictionary
        build_pkl(vocab)
    elif type == "pos":
        vocab = pos_vocab
        run_all_docs(path, vocab)
        pos_dict = (remove_common_words(pos_vocab, neg_vocab))#remove common words and make a dictionary 
        build_pkl(vocab)
    else:
        print("Wrong vocabulary type \nPlease enter either neg or pos")


In [33]:
#physical path of text file which have movie reviews 
neg_path ="C:/Users/family/Desktop/Sentiment_NLP/Data/txt_sentoken/neg"
pos_path ="C:/Users/family/Desktop/Sentiment_NLP/Data/txt_sentoken/pos"

# it's taking physical path of txt file and type of sentiment as a parameter
sentiment_vocab(pos_path, "pos") 
sentiment_vocab(neg_path, "neg")

In [98]:
#Making DataFrame which doesnt have common words in both (negative and postive)

In [29]:
print(len(neg_dict), len(pos_dict)) 

15835 20246


In [106]:
# print(neg_df.shape)
# print(pos_df.shape)
# print(neg_df_no_comm.shape)
# print(pos_df_no_comm.shape)