In [1]:
import csv
import re
from os import listdir
from os.path import basename, isfile, join, splitext
from string import digits
from string import punctuation

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# English stopword list, loaded once and reused for every membership test.
stop_words = set(stopwords.words('english'))


def removeStopwords(text):
    """Return ``text`` reduced to lowercase, stemmed, non-stopword tokens.

    Steps, in order:
      1. every character outside ``a-z`` / ``A-Z`` is replaced by a space;
      2. the result is lowercased and split on whitespace;
      3. tokens found in ``stop_words`` are dropped;
      4. remaining tokens are Porter-stemmed and re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated stemmed tokens; ``''`` when nothing survives.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    ps = PorterStemmer()
    # Use the precomputed module-level set: the previous version re-read the
    # stopword corpus and rebuilt the set once per *word*.
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)

In [3]:
# Sorted lists of the regular files found in each sample directory.
sampled_test = sorted(
    join('sampled_test', f) for f in listdir('sampled_test')
    if isfile(join('sampled_test', f))
)
sampled_train = sorted(
    join('sampled_train', f) for f in listdir('sampled_train')
    if isfile(join('sampled_train', f))
)

# Map column 0 of the annotations file to column 4.
# NOTE(review): presumably column 0 is the file id and column 4 the label - confirm against the CSV.
d = pd.read_csv('annotations_metadata.csv')
d = np.array(d.iloc[:, [0, 4]])
dic = {row[0]: row[1] for row in d}


def _load_stemmed_split(paths, labels):
    """Read every file in ``paths`` and return ``(documents, targets)``.

    Each document is lowercased and passed through ``removeStopwords``
    (letters only, stopwords removed, Porter-stemmed). Files that end up
    with no tokens are skipped, so both returned lists stay aligned.

    Parameters
    ----------
    paths : iterable of str
        File paths; the file name without its extension is the lookup key.
    labels : dict
        Maps that key to the target value.

    Raises
    ------
    KeyError
        If a non-empty file has no entry in ``labels``.
    """
    documents, targets = [], []
    for path in paths:
        # basename/splitext handle the platform's own separator; splitting on
        # a literal backslash only worked on Windows.
        file_id = splitext(basename(path))[0]
        with open(path, encoding="utf8") as f:
            cleaned = removeStopwords(f.read().lower())
        if cleaned:
            documents.append(cleaned)
            targets.append(labels[file_id])
    return documents, targets


X_train, y_train = _load_stemmed_split(sampled_train, dic)
X_test, y_test = _load_stemmed_split(sampled_test, dic)

# Vocabulary over train + test documents. Sorted so the feature column order
# is identical on every run (plain set iteration order depends on string hashing).
corpus = sorted(set(" ".join(X_train + X_test).split()))

In [4]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary=corpus)

# Tfidf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(vocabulary=corpus)

# NOTE: X_train / X_test are rebound from lists of strings to DataFrames here,
# so this cell is not re-runnable without first re-running the loading cell.
X = vectorizer.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when present and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

X_train = pd.DataFrame(data=X.toarray(), columns=feature_names)

X = vectorizer.transform(X_test)
X_test = pd.DataFrame(data=X.toarray(), columns=feature_names)

In [5]:
from sklearn.preprocessing import LabelEncoder

labelencoder_y = LabelEncoder()
# The encoder is fitted on the training labels (first tuple element) and the
# same fitted mapping is then applied to the test labels (second element).
y_train, y_test = (
    labelencoder_y.fit_transform(y_train),
    labelencoder_y.transform(y_test),
)

In [6]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression solved with liblinear, capped at 50 iterations.
classifier = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    max_iter=50,
)
# Left as the last expression so the fitted estimator is displayed as cell output.
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
from sklearn.metrics import accuracy_score

# Score the logistic regression on the held-out test matrix.
y_pred = classifier.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", logreg_accuracy)

Accuracy : 0.7109704641350211


In [8]:
# Print the fitted coefficient array of `classifier` (the L1-penalised logistic regression).
print(classifier.coef_)

[[0. 0. 0. ... 0. 0. 0.]]


In [9]:
# Print the fitted intercept term of `classifier`.
print(classifier.intercept_)

[1.21197962]


In [10]:
from sklearn.linear_model import Perceptron

# Perceptron with an L1 penalty; random_state is fixed so the fit is repeatable.
clf = Perceptron(
    random_state=0,
    max_iter=500,
    alpha=0.0001,
    penalty='l1',
)
# Left as the last expression so the fitted estimator is displayed as cell output.
clf.fit(X_train, y_train)



Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=500, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty='l1', random_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
# Score the Perceptron on the held-out test matrix.
pred = clf.predict(X_test)
perceptron_accuracy = accuracy_score(y_test, pred)
print("Accuracy :", perceptron_accuracy)

Accuracy : 0.6814345991561181


In [16]:
# Print the weight array learned by the Perceptron `clf`.
print(clf.coef_)

[[0.   0.65 0.   ... 0.   0.   0.  ]]


In [17]:
# Print the intercept term learned by the Perceptron `clf`.
print(clf.intercept_)

[1.]


In [18]:
# with stopwords


def _load_unstemmed_split(paths, labels):
    """Read every file in ``paths`` and return ``(documents, targets)``.

    Text is lowercased, every non-letter character is replaced by a space and
    the tokens are re-joined with single spaces. Stopwords are kept and no
    stemming is applied. Files with no tokens are skipped so both returned
    lists stay aligned.

    Parameters
    ----------
    paths : iterable of str
        File paths; the file name without its extension is the lookup key.
    labels : dict
        Maps that key to the target value.

    Raises
    ------
    KeyError
        If a non-empty file has no entry in ``labels``.
    """
    documents, targets = [], []
    for path in paths:
        # basename/splitext handle the platform's own separator; splitting on
        # a literal backslash only worked on Windows.
        file_id = splitext(basename(path))[0]
        with open(path, encoding="utf8") as f:
            tokens = re.sub('[^a-zA-Z]', ' ', f.read().lower()).split()
        if tokens:
            documents.append(" ".join(tokens))
            targets.append(labels[file_id])
    return documents, targets


X_train1, y_train1 = _load_unstemmed_split(sampled_train, dic)
X_test1, y_test1 = _load_unstemmed_split(sampled_test, dic)

# Vocabulary over train + test documents. Sorted so the feature column order
# is identical on every run (plain set iteration order depends on string hashing).
corpus1 = sorted(set(" ".join(X_train1 + X_test1).split()))

In [19]:
# Count Vectorizer
# vectorizer1 = CountVectorizer(vocabulary=corpus1)

# Tfidf Vectorizer
vectorizer1 = TfidfVectorizer(vocabulary=corpus1)

# NOTE: X_train1 / X_test1 are rebound from lists of strings to DataFrames
# here, so this cell is not re-runnable without re-running the loading cell.
X = vectorizer1.fit_transform(X_train1)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when present and fall back on older versions.
if hasattr(vectorizer1, "get_feature_names_out"):
    feature_names1 = list(vectorizer1.get_feature_names_out())
else:
    feature_names1 = vectorizer1.get_feature_names()

X_train1 = pd.DataFrame(data=X.toarray(), columns=feature_names1)

X = vectorizer1.transform(X_test1)
X_test1 = pd.DataFrame(data=X.toarray(), columns=feature_names1)

# Fit the label mapping on the training labels only and reuse it for the test
# labels. Re-fitting on the test labels could assign different integer codes
# if the two splits do not contain the same set of classes.
y_train1 = labelencoder_y.fit_transform(y_train1)
y_test1 = labelencoder_y.transform(y_test1)

In [20]:
# logistic regression
# Default-configured model; fit() returns the estimator itself, so construct
# and train in a single expression.
classifier1 = LogisticRegression().fit(X_train1, y_train1)
y_pred1 = classifier1.predict(X_test1)
logreg_accuracy1 = accuracy_score(y_test1, y_pred1)
print("Accuracy :", logreg_accuracy1)

Accuracy : 0.7557894736842106




In [21]:
# Perceptron
# The Perceptron shuffles the training data by default, so an unseeded model
# gives a different accuracy on each run. Seed it, matching the earlier
# Perceptron cell which uses random_state=0.
clf1 = Perceptron(random_state=0)
clf1.fit(X_train1, y_train1)
pred1 = clf1.predict(X_test1)
print("Accuracy :", accuracy_score(y_test1, pred1))



Accuracy : 0.6926315789473684
