In [19]:
import os
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [20]:
import warnings
warnings.simplefilter('ignore')

## Models without preprocessing

In [38]:
# Locate the raw training CSV inside the current working directory.
# `cwd` is kept as a notebook-level name because a later cell reuses it.
cwd = os.getcwd()
train_csv_path = os.path.join(cwd, 'reddit_train.csv')

# Read the un-preprocessed training data into a DataFrame.
df_train = pd.read_csv(train_csv_path)


In [39]:
# Model input is the 'comments' column; the target is the 'subreddits' column.
X_train = df_train['comments']
y_train = df_train['subreddits']

#### Tf-Idf

In [40]:
# from sklearn.feature_extraction.text import TfidfVectorizer


# #tfidf_vectorizer = TfidfVectorizer(max_features=None, smooth_idf=True, sublinear_tf=True)#, ngram_range=(1, 2))
# tfidf_vectorizer = TfidfVectorizer(max_features=None, smooth_idf=True, sublinear_tf=True)

# Encoder = LabelEncoder()

# #X_train = Encoder.fit_transform(X_train)
# y_train = Encoder.fit_transform(y_train)

# tfidf_vectorizer.fit(X_train)
# X_train = tfidf_vectorizer.transform(X_train)



#### Count Vectorization

In [41]:
# Bag-of-words counts for the raw comments and integer-encoded subreddit labels.
#
# Both transforms read straight from df_train instead of from X_train / y_train,
# which this cell itself overwrites. That keeps the cell safe to re-run: the
# vectoriser is never handed its own sparse output, and the encoder is never re-fit
# on already-encoded integers (which would replace the subreddit names in
# Encoder.classes_ with numbers).
#
# CountVectorizer and LabelEncoder are provided by the import cell at the top of
# the notebook, so they are not re-imported here.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(df_train['comments'])

Encoder = LabelEncoder()
y_train = Encoder.fit_transform(df_train['subreddits'])

## Training classifiers before preprocessing

##### Multinomial Naive Bayes

In [42]:
# Multinomial Naive Bayes, scored with 5-fold cross-validation on X_train / y_train.
Naive = naive_bayes.MultinomialNB()
x = cross_val_score(estimator=Naive, X=X_train, y=y_train, cv=5)

# Report the mean of the five per-fold scores.
print("Naive Bayes Classifier Accuracy (before preprocessing): ")
print("5-fold cross validation average: ", np.mean(x))

Naive Bayes Classifier Accuracy (before preprocessing): 
5-fold cross validation average:  0.530778982172196


##### Logistic Regression

In [43]:
# Logistic regression with the library's default settings, 5-fold cross-validated.
# NOTE(review): warnings are silenced globally earlier in the notebook, so a solver
# that stops at its iteration cap would go unnoticed here — worth confirming.
logreg = LogisticRegression()
y = cross_val_score(estimator=logreg, X=X_train, y=y_train, cv=5)

# Report the mean of the five per-fold scores.
print("Logistic Regression Classifier Accuracy (before preprocessing): ")
print("5-fold cross validation average: ", np.mean(y))

Logistic Regression Classifier Accuracy (before preprocessing): 
5-fold cross validation average:  0.5250789393119917


##### Stochastic Gradient Descent Classifier 

In [44]:
# Linear classifier trained by stochastic gradient descent, 5-fold cross-validated.
#
# SGDClassifier shuffles the training data between epochs by default, so an unseeded
# estimator yields a different score on every run. Fixing random_state makes the
# number printed below repeatable across kernel restarts.
SGD = linear_model.SGDClassifier(random_state=42)
a = cross_val_score(SGD, X_train, y_train, cv=5)

# Report the mean of the five per-fold scores.
print("Stochastic Gradient Descent Classifier Accuracy (before preprocessing): ")
print("5-fold cross validation average: ", a.mean())

Stochastic Gradient Descent Classifier Accuracy (before preprocessing): 
5-fold cross validation average:  0.4539635698468258


##### Support Vector Classifier 

In [45]:
# Linear support vector classifier (L2 penalty, squared hinge loss, one-vs-rest),
# 5-fold cross-validated on the raw-text features.
#
# random_state is pinned because the dual solver (dual=True) draws pseudo-random
# numbers while optimising; without a seed the reported score changes between runs.
SVC = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=0.0001,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=0,
    random_state=42,
    max_iter=10000,
)

# Kept under its own name: the preprocessed-data SVC cell writes to `svc_pre`, so
# storing the raw-data scores there would let that later cell overwrite them.
svc_raw = cross_val_score(SVC, X_train, y_train, cv=5)

# The printed value is the mean over the five folds, so the label says so
# (matching the other classifier cells).
print("Support Vector Classifier Accuracy (before preprocessing): ")
print("5-fold cross validation average: ", svc_raw.mean())

Support Vector Classifier Accuracy (before preprocessing): 
Accuracy array Linear SVC:  0.4902784270305022


## The same models with preprocessed data 

In [46]:
# Read 'preprocessed_train.csv' from the directory stored in `cwd` (set in an earlier cell).
preprocessed_csv_path = os.path.join(cwd, 'preprocessed_train.csv')
df_train_preprocessed = pd.read_csv(preprocessed_csv_path)

# Model input is the 'comments' column; the target is the 'subreddits' column.
X_train_pre = df_train_preprocessed['comments']
y_train_pre = df_train_preprocessed['subreddits']

#### Tf-Idf

In [47]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer(max_features=None, smooth_idf=True, sublinear_tf=True)



# Encoder = LabelEncoder()

# y_train_pre = Encoder.fit_transform(y_train_pre)
# #y_test = Encoder.fit_transform(y_test)

# tfidf_vectorizer.fit(X_train_pre)
# X_train_pre = tfidf_vectorizer.transform(X_train_pre)


# #df_test = tfidf_vectorizer.transform(df_test)

#### Count Vectorization

In [49]:
# Bag-of-words counts for the preprocessed comments and integer-encoded labels.
#
# Both transforms read straight from df_train_preprocessed instead of from
# X_train_pre / y_train_pre, which this cell itself overwrites. That keeps the cell
# safe to re-run: the vectoriser is never handed its own sparse output, and the
# encoder is never re-fit on already-encoded integers (which would replace the
# subreddit names in Encoder.classes_ with numbers).
#
# CountVectorizer and LabelEncoder are provided by the import cell at the top of
# the notebook, so they are not re-imported here.
vectorizer = CountVectorizer()
X_train_pre = vectorizer.fit_transform(df_train_preprocessed['comments'])

Encoder = LabelEncoder()
y_train_pre = Encoder.fit_transform(df_train_preprocessed['subreddits'])

##### Multinomial Naive Bayes

In [50]:
# Multinomial Naive Bayes, scored with 5-fold cross-validation on
# X_train_pre / y_train_pre.
Naive = naive_bayes.MultinomialNB()
x_pre = cross_val_score(estimator=Naive, X=X_train_pre, y=y_train_pre, cv=5)

# Report the mean of the five per-fold scores.
print("Naive Bayes Classifier Accuracy (after preprocessing): ")
print("5-fold cross validation average: ", np.mean(x_pre))

Naive Bayes Classifier Accuracy (after preprocessing): 
5-fold cross validation average:  0.5444792321900545


##### Logistic Regression

In [51]:
# Logistic regression with the library's default settings, 5-fold cross-validated
# on the preprocessed data.
# NOTE(review): warnings are silenced globally earlier in the notebook, so a solver
# that stops at its iteration cap would go unnoticed here — worth confirming.
logreg = LogisticRegression()
y_pre = cross_val_score(estimator=logreg, X=X_train_pre, y=y_train_pre, cv=5)

# Report the mean of the five per-fold scores.
print("Logistic Regression Classifier Accuracy (after preprocessing): ")
print("5-fold cross validation average: ", np.mean(y_pre))

Logistic Regression Classifier Accuracy (after preprocessing): 
5-fold cross validation average:  0.5238788984927495


##### Stochastic Gradient Descent Classifier

In [52]:
# Linear classifier trained by stochastic gradient descent, 5-fold cross-validated
# on the preprocessed data.
#
# SGDClassifier shuffles the training data between epochs by default, so an unseeded
# estimator yields a different score on every run. Fixing random_state makes the
# number printed below repeatable across kernel restarts.
SGD = linear_model.SGDClassifier(random_state=42)
a_pre = cross_val_score(SGD, X_train_pre, y_train_pre, cv=5)

# Report the mean of the five per-fold scores.
print("Stochastic Gradient Descent Classifier Accuracy (after preprocessing): ")
print("5-fold cross validation average: ", a_pre.mean())

Stochastic Gradient Descent Classifier Accuracy (after preprocessing): 
5-fold cross validation average:  0.5078357474513485


##### Support Vector Classifier

In [54]:
# Linear support vector classifier (L2 penalty, squared hinge loss, one-vs-rest),
# 5-fold cross-validated on the preprocessed data.
#
# random_state is pinned because the dual solver (dual=True) draws pseudo-random
# numbers while optimising; without a seed the reported score changes between runs.
SVC = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=0.0001,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=0,
    random_state=42,
    max_iter=10000,
)

svc_pre = cross_val_score(SVC, X_train_pre, y_train_pre, cv=5)

# The printed value is the mean over the five folds, so the label says so
# (matching the other classifier cells).
print("Support Vector Classifier Accuracy (after preprocessing): ")
print("5-fold cross validation average: ", svc_pre.mean())

Support Vector Classifier Accuracy (after preprocessing): 
Accuracy array Linear SVC:  0.49286412804996277
