In [1]:
# importing different libraries
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop_words = stopwords.words('english')

In [2]:
# loading our dataset
train_df = pd.read_csv('../input/midas-task/reddit_data.csv')

# dropping rows having null values
train_df.dropna(inplace=True)

# creating a label column to encode our text labels to no.
le = preprocessing.LabelEncoder()
le.fit(train_df["flair"])
train_df["label"] = le.transform(train_df["flair"])
train_df.head()

Unnamed: 0,text,flair,dirty_text,label
0,top comments toi article drop us oil prices,Non-Political,Top comments on a TOI article about the drop i...,3
1,disappointed,Politics,Disappointed,5
2,hacking networking security 2 books 1 hacking ...,Non-Political,Hacking: Networking and Security (2 Books in 1...,3
3,zakir khan irfan junejo live instagram session...,Non-Political,Zakir Khan and Irfan Junejo live Instagram Ses...,3
4,cursing quentin tarantino movie,Non-Political,Cursing In A Quentin Tarantino Movie,3


In [3]:
# Splitting the data and using "dirty_text" for training
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.dirty_text, train_df.label, 
                                                  stratify=train_df.label, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [4]:
# function for tokenization, we are also using stemming to reduce no. of unique tokens
# And we are using Porter Stemmer from NLTK for stemming
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [5]:
# Initialising the TF-IDF Vectorizer
tfv = TfidfVectorizer(min_df=3,  max_features=None, tokenizer = tokenize,
                    strip_accents='unicode', analyzer='word',token_pattern=None,
                    ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1)
# We have not used stopwords argument to remove stopwords in tfidf because the text which we are using are title
# of posts and because of this the no. of words are itself less so, there is no point in reducing no. of words further. 

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))

# Now transforming to TF-IDF Vectors
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

# saving the trained vectorizer model
filename = 'tfidf_vectors.sav'
joblib.dump(tfv, filename)

['tfidf_vectors.sav']

In [6]:
# Initialising the Count Vectorizer
ctv = CountVectorizer(analyzer='word',tokenizer = tokenize, ngram_range=(1, 3))
# Same as TF-IDF Vectorizer, here also we have not removed stopwords

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))

#Now transforming to Count Vectors
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

# saving the trained vectorizer model
filename = 'count_vectors.sav'
joblib.dump(ctv, filename)



['count_vectors.sav']

In [7]:
svd = TruncatedSVD(n_components=200)

# Fitting the TF-IDF vectors
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# saving the SVD model
filename = 'trun_svd.sav'
joblib.dump(svd, filename)

# saving the Scaler model
filename = 'scaler.sav'
joblib.dump(scl, filename)

['scaler.sav']

In [8]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict(xvalid_svd)

print('accuracy %s' % accuracy_score(yvalid, predictions))

joblib.dump(clf, "xgboost_svd.sav")

accuracy 0.5773035887487876


['xgboost_svd.sav']

In [9]:
# clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
#                         subsample=0.8, nthread=10, learning_rate=0.1)
# clf.fit(xtrain_svd_scl, ytrain)
# predictions = clf.predict(xtrain_svd_scl)

# print('accuracy %s' % accuracy_score(yvalid, predictions))

# joblib.dump(clf, "xgboost_svd_scl.sav")

In [10]:
# import joblib
# joblib.dump(clf, "xgboost.sav")

In [11]:
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict(xtrain)

print('accuracy %s' % accuracy_score(yvalid, predictions))

joblib.dump(clf, "xgboost_svd_wpt.sav")

TypeError: can not initialize DMatrix from Series

In [12]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict(xvalid_tfv.tocsc())

print('accuracy %s' % accuracy_score(yvalid, predictions))

joblib.dump(clf, "xgboost_tfv.sav")

accuracy 0.5888134497251859


['xgboost_tfv.sav']

In [13]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict(xvalid_ctv.tocsc())

print('accuracy %s' % accuracy_score(yvalid, predictions))

joblib.dump(clf, "xgboost_ctv.sav")

accuracy 0.5908179760750081


['xgboost_ctv.sav']

In [14]:
# xgb.XGBClassifier??