In [1]:
import numpy as np 
import pandas as pd
import bz2

In [2]:
def labels_text(x):
    """Read a bz2-compressed, line-oriented review file into labels and texts.

    Each non-blank line is decoded as UTF-8. The character at offset 9 is
    parsed as an integer label and shifted down by one (so a stored ``1``/``2``
    becomes ``0``/``1``); everything after that character, stripped of
    surrounding whitespace, is the review text.
    NOTE(review): the 9-character prefix is presumably fastText's
    ``__label__`` tag -- confirm against the data files.

    Parameters
    ----------
    x : str or path-like
        Path of the ``.bz2`` file to read.

    Returns
    -------
    tuple
        ``(labels, texts)`` where ``labels`` is a NumPy integer array and
        ``texts`` is a list of ``str`` of the same length.

    Raises
    ------
    ValueError
        If the character at the label offset is not a digit.
    """
    label_pos = 9              # index of the single label digit
    text_start = label_pos + 1  # review text begins right after the digit

    label = []
    text = []
    # The context manager guarantees the compressed file handle is released,
    # even if decoding or parsing fails part-way through.
    with bz2.BZ2File(x) as archive:
        for line in archive:
            decode = line.decode("utf-8")
            if not decode.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            label.append(int(decode[label_pos]) - 1)
            text.append(decode[text_start:].strip())
    return np.array(label), text

# Parse both compressed splits; each call yields (label array, list of texts).
(train_label, train_text), (test_label, test_text) = (
    labels_text('train.ft.txt.bz2'),
    labels_text('test.ft.txt.bz2'),
)

In [3]:
from sklearn.utils import shuffle
# Shuffle texts and labels in unison so the fixed-size slices taken later are
# a random sample of each split. A fixed random_state makes that sample (and
# therefore every downstream metric) reproducible across kernel restarts.
train_text, train_label = shuffle(train_text, train_label, random_state=0)
test_text, test_label = shuffle(test_text, test_label, random_state=0)

In [4]:
# Sanity check: display the first review text after shuffling.
train_text[0]

"About that extra DVD...: If you're gonna buy this cd, definately get this edition, with the bonus DVD. Although the DVD only features 4 music videos and 2 songs (it's pretty basic, and not incredibly exciting), it's a nice supplement to the CD. (plus it's free) Whatever you do, don't pay extra for the DVD disc. The CD is only about 40 minutes long, so the added DVD disc makes your purchase slightly more worth the money. Should you buy it? Well, I bought it and I wasn't disappointed. It's one of those things that if you're contemplating buying it, then go for it. The CD has an old early-60's rock-esque sound to it, yet while I listen to it, I am also reminded of the Violent Femmes."

In [5]:
# Label belonging to train_text[0]; labels_text stores the parsed digit minus one.
train_label[0]

1

In [6]:
# Number of reviews in the full training split.
len(train_text)

3600000

In [7]:
# Number of reviews in the full test split.
len(test_text)

400000

In [8]:
# Keep only the first 10000 (already shuffled) training reviews and their labels.
train_text, train_label = train_text[:10000], train_label[:10000]

In [9]:
# Keep only the first 2500 (already shuffled) test reviews and their labels.
test_text, test_label = test_text[:2500], test_label[:2500]

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Build the cleaned training corpus: one lower-cased, stopword-free, stemmed
# string per review.
#
# Everything that does not depend on the individual review is created once,
# outside the loop: the stemmer, the stopword list, and a set for O(1)
# membership tests (previously the list was reloaded per review and the set
# was rebuilt for every single word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation carries sentiment
stopword_set = set(all_stopwords)

corpus_train = []
# Slicing caps the corpus at 10000 reviews like the original loop did, but
# no longer raises IndexError when fewer reviews are available.
for raw_review in train_text[:10000]:
    review = re.sub('".*?"', '', raw_review)   # drop anything inside double quotes
    review = re.sub('[^a-zA-Z]', ' ', review)  # keep letters only
    review = review.lower().split()            # lower-case, then tokenise on whitespace
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)                  # back to a single space-separated string
    corpus_train.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Mohammed
[nltk_data]     Fouzan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features for the training corpus, vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
y = train_label[:10000]
# fit_transform learns the vocabulary and returns a sparse count matrix;
# it is densified here because the rest of the notebook works on arrays.
X = cv.fit_transform(corpus_train).toarray()

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the vectorised reviews for validation; random_state pins the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [13]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier with C=3, fitted on the training portion.
# Tuning log left by the author; format is presumably
# "C = validation accuracy % (test accuracy %)" -- TODO confirm:
#   2.0 = 84.52 (-), 1.75 = 84.72 (83.16), 1.5 = 84.84 (83.13), 3.25 = 84.6 (-)
classifier = SVC(C=3,kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=3, random_state=0)

In [14]:
# Predict labels for the held-out validation rows.
y_pred = classifier.predict(X_val)

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Score the validation predictions: confusion matrix, then overall accuracy.
cm, acc = confusion_matrix(y_val, y_pred), accuracy_score(y_val, y_pred)
print(cm)
print(acc)

[[1070  207]
 [ 168 1055]]
0.85


In [16]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the classifier on the training portion;
# report the mean and spread of the per-fold accuracies as percentages.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

Accuracy: 84.83 %
Standard Deviation: 1.78 %


In [17]:
# Build the cleaned test corpus with exactly the same steps used for the
# training corpus: strip quoted spans, keep letters, lower-case, drop
# stopwords (except "not"), stem, and re-join.
#
# Loop-invariant helpers are created once up front instead of once per review
# (stemmer, stopword list) or once per word (stopword set).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation carries sentiment
stopword_set = set(all_stopwords)

corpus_test = []
# Slicing caps the corpus at 2500 reviews like the original loop did, but
# no longer raises IndexError when fewer reviews are available.
for raw_review in test_text[:2500]:
    review = re.sub('".*?"', '', raw_review)   # drop anything inside double quotes
    review = re.sub('[^a-zA-Z]', ' ', review)  # keep letters only
    review = review.lower().split()            # lower-case, then tokenise on whitespace
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)                  # back to a single space-separated string
    corpus_test.append(review)

In [18]:
# Test targets and features. The vectoriser is only applied (transform),
# not refitted, so the columns match those the classifier was trained on.
y_test = test_label[:2500]
X_test = cv.transform(corpus_test).toarray()

In [19]:
# Predict labels for the vectorised test reviews with the fitted classifier.
y_pred_new = classifier.predict(X_test)

In [21]:
# Final evaluation on the test sample: confusion matrix, then accuracy in percent.
cm_final, acc_final = confusion_matrix(y_test, y_pred_new), accuracy_score(y_test, y_pred_new)
print(cm_final)
print(acc_final*100)

[[1042  217]
 [ 184 1057]]
83.96000000000001
