In [1]:
#system libraries
import os
import string
import warnings
#linear algebra
import numpy as np
import pandas as pd
#plotting
import seaborn as sns
import matplotlib.pyplot as plt
#machine learning and nlp
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize,word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle


<h2> Data-PreProcessing </h2>

In [2]:
#filepath
training_positive = "./train/pos/"
training_negative = "./train/neg/"

ps = PorterStemmer()
tokenizer = RegexpTokenizer(r"\w+")

#loop-invariant lookup tables, built once as sets so membership tests are O(1)
#(previously the stop-word list was re-read and scanned linearly for every file)
punctuation = set(string.punctuation)
punctuation.add("''")
st_words = set(stopwords.words("english"))

#read positive training data into a list
#each entry is one review: lower-cased, filtered, stemmed tokens joined by spaces
train_list = []

#sorted() makes the document order reproducible; os.listdir() order is arbitrary
for file in sorted(os.listdir(training_positive)) :

    if file.endswith(".txt") :

        #decode explicitly instead of relying on the platform default encoding
        #assumes the review files are UTF-8 -- TODO confirm; undecodable bytes are replaced
        with open(os.path.join(training_positive, file), encoding="utf-8", errors="replace") as fh :
            text = fh.read()

        #get lower-cased word tokens
        toks = [tok.lower() for tok in tokenizer.tokenize(text)]
        #remove punctuation and stop words, then perform stemming
        tokens = [ps.stem(tok) for tok in toks
                  if tok not in punctuation and tok not in st_words]
        train_list.append(" ".join(tokens))
            





In [3]:

#sanity check: number of documents collected in train_list so far
print(len(train_list))

12500


In [4]:
#read negative train data into the list
#the documents are appended after the ones already in train_list

#loop-invariant lookup tables, built once as sets so membership tests are O(1)
#(previously the stop-word list was re-read and scanned linearly for every file)
punctuation = set(string.punctuation)
punctuation.add("''")
st_words = set(stopwords.words("english"))

#sorted() makes the document order reproducible; os.listdir() order is arbitrary
for file in sorted(os.listdir(training_negative)) :

    if file.endswith(".txt") :

        #decode explicitly instead of relying on the platform default encoding
        #assumes the review files are UTF-8 -- TODO confirm; undecodable bytes are replaced
        with open(os.path.join(training_negative, file), encoding="utf-8", errors="replace") as fh :
            text = fh.read()

        #get lower-cased word tokens
        toks = [tok.lower() for tok in tokenizer.tokenize(text)]
        #remove punctuation and stop words, then perform stemming
        tokens = [ps.stem(tok) for tok in toks
                  if tok not in punctuation and tok not in st_words]
        train_list.append(" ".join(tokens))

#total number of training documents collected so far
print(len(train_list))

25000


<h2> Vectorize the train data </h2>

In [5]:
#bag-of-words vectorizer: raw (non-binary) word counts, vocabulary capped at 5000 terms
vect = CountVectorizer(
    max_features = 5000,
    binary = False,
    analyzer = "word",
)
#learn the vocabulary from the training documents; fit() returns the vectorizer,
#so it is also what the cell displays
vect.fit(train_list)

CountVectorizer(max_features=5000)

In [20]:
#vectorize the training documents and convert the sparse count matrix to a dense array
X = vect.transform(train_list).toarray()
#preview of the first 1000 rows
print(X[:1000])

[[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [21]:
#delete to save ram
#after this, any cell that reads train_list cannot be re-run until the list is rebuilt
del train_list

In [22]:
#one row per document, one column per vocabulary term
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use its replacement when available and fall back on older versions
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
df = pd.DataFrame(X, columns=feature_names)
df.head(10)

Unnamed: 0,00,000,10,100,11,12,13,13th,14,15,...,young,younger,youngest,youth,zane,zero,zizek,zombi,zone,zoom
0,0,0,1,0,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<p>Append the target labels to the feature table.</p>

<p>The first 12500 rows are positive reviews (label 1); the remaining 12500 rows are negative reviews (label 0).</p>


In [23]:
#drop the name X; the count matrix is used through df from here on
del X

In [24]:
#target labels for 25000 rows: the first 12500 are 1 (positive), the remaining 12500 are 0
#the counts are hard-coded, so they must match the number of documents actually loaded
y_vals = [1] * 12500 + [0] * (25000 - 12500)

In [25]:
#attach the target labels as the 'sentiment' column
#if the fitted vocabulary itself contains the term "sentiment", the assignment below
#replaces that feature column instead of appending a new one -- make this visible
if "sentiment" in vect.vocabulary_:
    warnings.warn("label column 'sentiment' overwrites the bag-of-words feature of the same name")
df['sentiment'] = y_vals

In [26]:
#persist the labelled count table as CSV
#NOTE(review): the row index is written as the first, unnamed column; rows are ordered
#positives-first, so that column tracks the label -- confirm the consumer reads it as an index
df.to_csv("./train2.csv")

<h2> Test Data Pre-processing </h2>

In [9]:
#filepath
testing_positive = "./test/pos/"
testing_negative = "./test/neg/"

ps = PorterStemmer()
tokenizer = RegexpTokenizer(r"\w+")

#loop-invariant lookup tables, built once as sets so membership tests are O(1)
#(previously the stop-word list was re-read and scanned linearly for every file)
punctuation = set(string.punctuation)
punctuation.add("''")
st_words = set(stopwords.words("english"))

#read positive test data into a list
#each entry is one review: lower-cased, filtered, stemmed tokens joined by spaces
test_list = []

#sorted() makes the document order reproducible; os.listdir() order is arbitrary
for file in sorted(os.listdir(testing_positive)) :

    if file.endswith(".txt") :

        #decode explicitly instead of relying on the platform default encoding
        #assumes the review files are UTF-8 -- TODO confirm; undecodable bytes are replaced
        with open(os.path.join(testing_positive, file), encoding="utf-8", errors="replace") as fh :
            text = fh.read()

        #get lower-cased word tokens
        toks = [tok.lower() for tok in tokenizer.tokenize(text)]
        #remove punctuation and stop words, then perform stemming
        tokens = [ps.stem(tok) for tok in toks
                  if tok not in punctuation and tok not in st_words]
        test_list.append(" ".join(tokens))
            



In [10]:
#sanity check: number of documents collected in test_list so far
print(len(test_list))

12500


In [12]:
#read negative test data into the list
#the documents are appended after the ones already in test_list

#loop-invariant lookup tables, built once as sets so membership tests are O(1)
#(previously the stop-word list was re-read and scanned linearly for every file)
punctuation = set(string.punctuation)
punctuation.add("''")
st_words = set(stopwords.words("english"))

#sorted() makes the document order reproducible; os.listdir() order is arbitrary
for file in sorted(os.listdir(testing_negative)) :

    if file.endswith(".txt") :

        #decode explicitly instead of relying on the platform default encoding
        #assumes the review files are UTF-8 -- TODO confirm; undecodable bytes are replaced
        with open(os.path.join(testing_negative, file), encoding="utf-8", errors="replace") as fh :
            text = fh.read()

        #get lower-cased word tokens
        toks = [tok.lower() for tok in tokenizer.tokenize(text)]
        #remove punctuation and stop words, then perform stemming
        tokens = [ps.stem(tok) for tok in toks
                  if tok not in punctuation and tok not in st_words]
        test_list.append(" ".join(tokens))

#total number of test documents collected so far
print(len(test_list))

25000


In [13]:
#vectorize the test documents with the already-fitted vectorizer and
#convert the sparse count matrix to a dense array
X = vect.transform(test_list).toarray()
#preview of the first 1000 rows
print(X[:1000])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [14]:
#delete to save ram
#after this, any cell that reads test_list cannot be re-run until the list is rebuilt
del test_list

In [15]:
#one row per test document, one column per vocabulary term
#note: this rebinds df, which the training section also uses
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use its replacement when available and fall back on older versions
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
df = pd.DataFrame(X, columns=feature_names)
df.head(10)

Unnamed: 0,00,000,10,100,11,12,13,13th,14,15,...,young,younger,youngest,youth,zane,zero,zizek,zombi,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [16]:
#drop the name X; the count matrix is used through df from here on
del X

In [17]:
#target labels for 25000 rows: the first 12500 are 1 (positive), the remaining 12500 are 0
#the counts are hard-coded, so they must match the number of documents actually loaded
y_vals = [1] * 12500 + [0] * (25000 - 12500)

In [18]:
#attach the target labels as the 'sentiment' column
#if the fitted vocabulary itself contains the term "sentiment", the assignment below
#replaces that feature column instead of appending a new one -- make this visible
if "sentiment" in vect.vocabulary_:
    warnings.warn("label column 'sentiment' overwrites the bag-of-words feature of the same name")
df['sentiment'] = y_vals

In [19]:
#persist the labelled count table as CSV
#NOTE(review): the row index is written as the first, unnamed column; rows are ordered
#positives-first, so that column tracks the label -- confirm the consumer reads it as an index
df.to_csv("./test2.csv")