In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import glob
import re
import os

In [2]:
#append the path of the folder where the files are stored
dir = []
dir.append(r"D:\projects\motoon_2022\Assignment_5\session_5\sentiment_data\train\pos")
dir.append(r"D:\projects\motoon_2022\Assignment_5\session_5\sentiment_data\train\neg")
dir.append(r"D:\projects\motoon_2022\Assignment_5\session_5\sentiment_data\test\pos")
dir.append(r"D:\projects\motoon_2022\Assignment_5\session_5\sentiment_data\test\neg")

In [2]:
# Function to remove some invalid strings from the text
def removeAllInvalid(text):
    text = text.replace("<br />", " ")
    pattern = re.compile(r'[\r\n\t]{1,}')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'[ ]{2,}')
    text = re.sub(pattern, ' ', text)
    return text.strip()

In [3]:
# Get labels for the data
def getLabel(absolute):
    basename = os.path.basename(absolute)
    info = os.path.splitext(basename)
    filename = info[0]
    fileInfo = filename.split("_")
    return fileInfo[1]

In [5]:
#read all text files one by one and write file content and label into a file
file_big = 'aclImdb-all.txt'
with open(file_big, 'w', encoding = 'utf-8') as fnew:
    for d in dir:
        files = glob.glob(d + '\\*.txt')
        for f in files:
            content = ''
            with open(f, 'r', encoding = 'utf-8') as fold:
                content = fold.read()
            content = removeAllInvalid(content)
            #get label
            label = getLabel(f)
            
            fnew.write(content + "\t" + label + "\n")

In [4]:
# read the data
df = pd.read_csv('aclImdb-all.txt', sep = '\t', header = None)
df.columns = ['text', 'label']
df.head()

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,9
1,Homelessness (or Houselessness as George Carli...,8
2,Brilliant over-acting by Lesley Ann Warren. Be...,10
3,This is easily the most underrated film inn th...,7
4,This is not the typical Mel Brooks film. It wa...,8


In [5]:
# Data preprocessing
# Remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
def removeStopWords(text):
    return ' '.join([word for word in text.split() if word not in stop])
df['text'] = df['text'].apply(removeStopWords)
df.head()

Unnamed: 0,text,label
0,Bromwell High cartoon comedy. It ran time prog...,9
1,Homelessness (or Houselessness George Carlin s...,8
2,Brilliant over-acting Lesley Ann Warren. Best ...,10
3,This easily underrated film inn Brooks cannon....,7
4,This typical Mel Brooks film. It much less sla...,8


In [6]:
# Remove punctuation
import string
punctuation = string.punctuation
def removePunctuation(text):
    return ''.join([char for char in text if char not in punctuation])

df['text'] = df['text'].apply(removePunctuation)
df.head()

Unnamed: 0,text,label
0,Bromwell High cartoon comedy It ran time progr...,9
1,Homelessness or Houselessness George Carlin st...,8
2,Brilliant overacting Lesley Ann Warren Best dr...,10
3,This easily underrated film inn Brooks cannon ...,7
4,This typical Mel Brooks film It much less slap...,8


In [7]:
# Remove numbers
def removeNumbers(text):
    return ''.join([char for char in text if not char.isdigit()])
df['text'] = df['text'].apply(removeNumbers)
df.head()

Unnamed: 0,text,label
0,Bromwell High cartoon comedy It ran time progr...,9
1,Homelessness or Houselessness George Carlin st...,8
2,Brilliant overacting Lesley Ann Warren Best dr...,10
3,This easily underrated film inn Brooks cannon ...,7
4,This typical Mel Brooks film It much less slap...,8


In [8]:
# Remove URLs
def removeURLs(text):
    return ' '.join([word for word in text.split() if not word.startswith('http')])
df['text'] = df['text'].apply(removeURLs)
df.head()

Unnamed: 0,text,label
0,Bromwell High cartoon comedy It ran time progr...,9
1,Homelessness or Houselessness George Carlin st...,8
2,Brilliant overacting Lesley Ann Warren Best dr...,10
3,This easily underrated film inn Brooks cannon ...,7
4,This typical Mel Brooks film It much less slap...,8


In [9]:
# Remove words with length less than 3
def removeShortWords(text):
    return ' '.join([word for word in text.split() if len(word) > 2])
df['text'] = df['text'].apply(removeShortWords)
df.head()

Unnamed: 0,text,label
0,Bromwell High cartoon comedy ran time programs...,9
1,Homelessness Houselessness George Carlin state...,8
2,Brilliant overacting Lesley Ann Warren Best dr...,10
3,This easily underrated film inn Brooks cannon ...,7
4,This typical Mel Brooks film much less slapsti...,8


In [10]:
# Remove words with length more than 15
def removeLongWords(text):
    return ' '.join([word for word in text.split() if len(word) < 15])
df['text'] = df['text'].apply(removeLongWords)
df.head()

Unnamed: 0,text,label
0,Bromwell High cartoon comedy ran time programs...,9
1,Homelessness Houselessness George Carlin state...,8
2,Brilliant overacting Lesley Ann Warren Best dr...,10
3,This easily underrated film inn Brooks cannon ...,7
4,This typical Mel Brooks film much less slapsti...,8


In [11]:
# Remove special characters
def removeSpecialCharacters(text):
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text
df['text'] = df['text'].apply(removeSpecialCharacters)
df.head()

Unnamed: 0,text,label
0,bromwell high cartoon comedy ran time programs...,9
1,homelessness houselessness george carlin state...,8
2,brilliant overacting lesley ann warren best dr...,10
3,this easily underrated film inn brooks cannon ...,7
4,this typical mel brooks film much less slapsti...,8


In [12]:
from os import system
system("mkdir csv")
df.to_csv('csv/imdb.csv', index=False)
df.head()

Unnamed: 0,text,label
0,bromwell high cartoon comedy ran time programs...,9
1,homelessness houselessness george carlin state...,8
2,brilliant overacting lesley ann warren best dr...,10
3,this easily underrated film inn brooks cannon ...,7
4,this typical mel brooks film much less slapsti...,8


In [13]:
df = pd.read_csv('csv/imdb.csv')
df.head()

Unnamed: 0,text,label
0,bromwell high cartoon comedy ran time programs...,9
1,homelessness houselessness george carlin state...,8
2,brilliant overacting lesley ann warren best dr...,10
3,this easily underrated film inn brooks cannon ...,7
4,this typical mel brooks film much less slapsti...,8


In [14]:
# Features and labels
X = df['text']
y = df['label']

In [15]:
#split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print the shape of the train and test data
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(40000,)
(10000,)
(40000,)
(10000,)


In [16]:
# vectorization
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)
X_train_count.shape


(40000, 134250)

In [17]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
X_train_tfidf.shape

(40000, 134250)

In [23]:
# Building a pipeline
from sklearn.pipeline import Pipeline
# Import svm
from sklearn.svm import SVC

text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SVC(random_state=0))])

# Training Support Vector Machines - SVM and calculating its performance
text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)

0.4401

In [40]:
accuracy_svm = np.mean(predicted_svm == y_test)
print('Accuracy of Support Vector Machine is: {}'.format(accuracy_svm))

Accuracy of Support Vector Machine is: 0.4401


In [34]:
# import logistic regression classifier
from sklearn.linear_model import LogisticRegression

text_clf_lr = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-lr', LogisticRegression(random_state=0, max_iter=5000))])

# Trainig logistic regression
text_clf_lr = text_clf_lr.fit(X_train, y_train)
predicted_lr = text_clf_lr.predict(X_test)
accuracy_lr = np.mean(predicted_lr == y_test)

print('Accuracy of Logistic regression is: {}'.format(accuracy_lr))

Accuracy of Logistic regression is: 0.4362
