In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn import metrics

#import numpy as np
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler

# Training CSV location, relative to the notebook's working directory.
path = 'dataset//reddit_train.csv'

In [2]:
# Load the labelled training set; later cells read its 'comments' and
# 'subreddits' columns.
data = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Hold out 30% of the labelled comments for validation.
# A fixed random_state makes the split (and therefore the validation scores and
# the per-class reports reused by the ensemble later on) reproducible across
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data['comments'],
    data['subreddits'],
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

# TF-IDF features with sublinear term-frequency scaling.
tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=True)

# Vocabulary and IDF weights are learned on the training split only; the
# held-out split is projected onto that same vocabulary.
vectors_train_idf = tf_idf_vectorizer.fit_transform(raw_documents=X_train)
vectors_test_idf = tf_idf_vectorizer.transform(raw_documents=X_test)

In [5]:
# Sanity check: (number of training documents, vocabulary size).
print(f"{vectors_train_idf.shape}")

(49000, 62041)


# Find the best classifier for each class

In [6]:
### MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Multinomial naive Bayes on the TF-IDF features (alpha = additive smoothing).
MNB_clf = MultinomialNB(alpha=.2)
MNB_clf.fit(vectors_train_idf, y_train)
y_pred = MNB_clf.predict(vectors_test_idf)

# Per-class metrics as a nested dict; `select` later reads
# mnb_report[label]['precision'] from it.
mnb_report = metrics.classification_report(y_test, y_pred, output_dict=True)

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24, so the former
# `.astype(np.int).mean()` raises AttributeError on current NumPy.
# accuracy_score yields the same fraction of exact matches.
print("test accuracy: %0.2f%%" % (metrics.accuracy_score(y_test, y_pred) * 100))

test accuracy: 56.45%


In [7]:
from sklearn.svm import LinearSVC

# Linear SVM on the TF-IDF features; C is the inverse regularisation strength.
svc_clf = LinearSVC(C=0.2)

svc_clf.fit(vectors_train_idf, y_train)
y_pred_svc = svc_clf.predict(vectors_test_idf)

# Per-class metrics as a nested dict; `select` later reads
# svc_report[label]['precision'] from it.
svc_report = metrics.classification_report(y_test, y_pred_svc, output_dict=True)

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; accuracy_score
# computes the same fraction of exact matches without it.
print("test accuracy: %0.2f%%" % (metrics.accuracy_score(y_test, y_pred_svc) * 100))

test accuracy: 55.87%


In [8]:
from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier (default hinge loss, i.e. a linear SVM) with
# weight averaging. tol=None disables early stopping, so exactly max_iter
# epochs are run.
lsvm = SGDClassifier(average=True, alpha=0.00002, max_iter=20, tol=None)

lsvm.fit(vectors_train_idf, y_train)
y_pred_lsvm = lsvm.predict(vectors_test_idf)

# Per-class metrics as a nested dict; `select` later reads
# lsvm_report[label]['precision'] from it.
lsvm_report = metrics.classification_report(y_test, y_pred_lsvm, output_dict=True)

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; accuracy_score
# computes the same fraction of exact matches without it.
print("test accuracy: %0.2f%%" % (metrics.accuracy_score(y_test, y_pred_lsvm) * 100))

test accuracy: 55.99%


In [9]:
def select(label_a, label_b, label_d):
    """Combine the three classifiers' votes for a single sample.

    Parameters
    ----------
    label_a : label predicted by the MultinomialNB model.
    label_b : label predicted by the LinearSVC model.
    label_d : label predicted by the SGDClassifier model.

    Returns
    -------
    The MultinomialNB label unless LinearSVC and SGDClassifier agree with each
    other. When they agree, the label whose model has the highest per-class
    precision (looked up in the module-level ``mnb_report``, ``svc_report``
    and ``lsvm_report`` dicts) is returned; ties go to the earlier model in
    the order MNB, SVC, SGD.
    """
    # Without SVC/SGD agreement the MNB vote is kept as-is.
    if label_b != label_d:
        return label_a

    # (label, precision of the model that proposed it), in tie-break order.
    candidates = (
        (label_a, mnb_report[label_a]['precision']),
        (label_b, svc_report[label_b]['precision']),
        (label_d, lsvm_report[label_d]['precision']),
    )
    best = max(score for _, score in candidates)
    for label, score in candidates[:2]:
        if score == best:
            return label
    return label_d
    
# Start from the MNB predictions and overwrite each entry with the ensemble
# decision for that sample.
y_pred_final = np.copy(y_pred)

# zip keeps the three prediction arrays aligned without index arithmetic.
for i, (vote_mnb, vote_svc, vote_sgd) in enumerate(zip(y_pred, y_pred_svc, y_pred_lsvm)):
    y_pred_final[i] = select(vote_mnb, vote_svc, vote_sgd)

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24, so the former
# `.astype(np.int).mean()` raises AttributeError on current NumPy.
# accuracy_score yields the same fraction of exact matches.
print("test accuracy: %0.2f%%" % (metrics.accuracy_score(y_test, y_pred_final) * 100))


test accuracy: 57.10%


# Ensemble

In [10]:
# Input files for the final run: full labelled set and the unlabelled test set.
path = 'dataset//reddit_train.csv'
path_test = 'dataset//reddit_test.csv'

# Subreddit names listed for reference.
labels = [
    'Overwatch', 'Music', 'conspiracy', 'AskReddit', 'worldnews',
    'anime', 'hockey', 'baseball', 'funny', 'nba',
    'canada', 'soccer', 'nfl', 'trees', 'gameofthrones',
    'wow', 'leagueoflegends', 'GlobalOffensive', 'europe', 'movies',
]

data = pd.read_csv(path)
data_test = pd.read_csv(path_test)

# This time train on every labelled comment; X_test is the unlabelled test set,
# so the y_test from the earlier validation split is left untouched.
X_train, y_train = data['comments'], data['subreddits']
X_test = data_test['comments']

In [11]:
# Fresh vectorizer: the vocabulary is re-learned from the full training set.
tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=True)

# Fit on the training comments first, then project the test comments onto the
# same vocabulary (tuple elements are evaluated left to right).
vectors_train_idf, vectors_test_idf = (
    tf_idf_vectorizer.fit_transform(X_train),
    tf_idf_vectorizer.transform(X_test),
)

In [12]:
# Sanity check: both matrices must share the same number of columns.
print(vectors_train_idf.shape, vectors_test_idf.shape, sep="\n")

(70000, 74265)
(30000, 74265)


In [13]:
# Refit the three base models on the full training set and predict the
# unlabelled test comments. `fit` returns the estimator itself, so construction
# and fitting are chained; the fitting order is kept as MNB, SVC, SGD.

# NOTE(review): the validation run used alpha=.2 for MultinomialNB; confirm
# that .22 here is intentional.
MNB_clf = MultinomialNB(alpha=.22).fit(vectors_train_idf, y_train)
y_pred = MNB_clf.predict(vectors_test_idf)

svc_clf = LinearSVC(C=0.2).fit(vectors_train_idf, y_train)
y_pred_svc = svc_clf.predict(vectors_test_idf)

lsvm = SGDClassifier(
    average=True,
    alpha=0.00002,
    max_iter=20,
    tol=None,
).fit(vectors_train_idf, y_train)
y_pred_lsvm = lsvm.predict(vectors_test_idf)

In [14]:
def select(label_a, label_b, label_d):
    """Pick the final label for one sample from the three model votes.

    This re-declares the selection rule already defined in the validation
    section of the notebook. It reads per-class precision from the
    module-level ``mnb_report``, ``svc_report`` and ``lsvm_report`` dicts,
    which are built in the validation cells.

    Parameters
    ----------
    label_a : MultinomialNB prediction.
    label_b : LinearSVC prediction.
    label_d : SGDClassifier prediction.

    Returns
    -------
    ``label_a`` when LinearSVC and SGDClassifier disagree; otherwise the label
    backed by the highest per-class precision, preferring MNB, then SVC, then
    SGD on ties.
    """
    if label_b == label_d:
        p_mnb = mnb_report[label_a]['precision']
        p_svc = svc_report[label_b]['precision']
        p_sgd = lsvm_report[label_d]['precision']
        top = max(p_mnb, p_svc, p_sgd)
        # Earlier models win ties, mirroring an if/elif/else chain.
        return label_a if p_mnb == top else (label_b if p_svc == top else label_d)
    return label_a
    
# Seed the result with the MNB predictions, then replace every entry with the
# ensemble decision for that test comment.
y_pred_final = np.copy(y_pred)

for idx, votes in enumerate(zip(y_pred, y_pred_svc, y_pred_lsvm)):
    y_pred_final[idx] = select(*votes)


In [None]:
savepath="test_launch_kaggle.csv"
lens=len(y_pred_final)

# Write every prediction in a single pass, one label per row.
# Mode 'w' (rather than reopening the file in append mode once per row) makes
# the cell idempotent: re-running it no longer stacks duplicate rows onto an
# existing file, which would misalign the predictions with the test ids.
with open(savepath, 'w', newline='') as csvfile:
    csv_write = csv.writer(csvfile)
    csv_write.writerows([label] for label in y_pred_final)

# Read the file back as a sanity check. The handle is closed by the context
# manager, and only a short preview is printed instead of every row.
preview_rows = 5
with open(savepath, newline='') as f:
    csv_read = csv.reader(f)
    for row_number, line in enumerate(csv_read):
        if row_number >= preview_rows:
            break
        print(line)
print("wrote %d predictions to %s" % (lens, savepath))

In [None]:
# Build the submission file: test ids paired with the predicted category.
readpath=savepath
# NOTE: despite its name, `writepath` is only read from; it supplies the id
# column of the test set.
writepath="dataset//reddit_test.csv"
read_data = pd.read_csv(readpath, header=None, names =["Category"])
write_data = pd.read_csv(writepath,usecols=['id'])

# The column assignment below aligns on the row index, so a length mismatch
# (for example a stale prediction file with extra rows) would silently produce
# NaN or dropped labels. Fail loudly instead.
if len(read_data) != len(write_data):
    raise ValueError(
        "prediction count (%d) does not match number of test ids (%d)"
        % (len(read_data), len(write_data))
    )

write_data["Category"] = read_data["Category"]
pt="reddit_test_kaggle.csv"
write_data.to_csv(pt, mode = 'w',index =False)