In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import *
from sklearn.pipeline import Pipeline

In [2]:
def make_dfs(df, label):
    samp=df.sample(frac=1)[:100000]
    texts=[]
    labels=[]
    for i in samp.ngram:
        texts.append(i)
        labels.append(str(label))
    data=pd.DataFrame({'label':labels, 'text':texts})
    return data

In [3]:
hist=pd.read_excel('/media/sf_communalflat/ihatelinguistics/history_collocation_counts.xlsx', 'sixgrams')
ling=pd.read_excel('/media/sf_communalflat/ihatelinguistics/linguistics_collocation_counts.xlsx', 'sixgrams')
pol=pd.read_excel('/media/sf_communalflat/ihatelinguistics/politology_collocation_counts.xlsx', 'sixgrams')
psy=pd.read_excel('/media/sf_communalflat/ihatelinguistics/psychology_and_pedagogics_collocation_counts.xlsx', 'sixgrams')
law=pd.read_excel('/media/sf_communalflat/ihatelinguistics/law_collocation_counts.xlsx', 'sixgrams')
ec=pd.read_excel('/media/sf_communalflat/ihatelinguistics/economics_collocation_counts.xlsx', 'sixgrams')
soc=pd.read_excel('/media/sf_communalflat/ihatelinguistics/sociology_collocation_counts.xlsx', 'sixgrams')

In [4]:
all_data=pd.concat([make_dfs(hist, 'hist'), make_dfs(ling, 'ling'),make_dfs(pol, 'pol'),make_dfs(psy, 'psy'),make_dfs(law, 'law'),make_dfs(ec, 'ec'),make_dfs(soc, 'soc')])

In [5]:
texts_train, texts_test, labels_train, labels_test = train_test_split(all_data.text, all_data.label, test_size=0.2)
print(len(texts_train), len(texts_test), len(labels_train), len(labels_test))

482021 120506 482021 120506


In [31]:
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
X_train = vectorizer.fit_transform(texts_train)
print("X_train shape:", X_train.shape)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


X_train shape: (482021, 1491566)


In [32]:
X_test = vectorizer.transform(texts_test)
print("X_test shape:", X_test.shape)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


X_test shape: (120506, 1491566)


In [8]:
svc_clf = LinearSVC()
svc_clf.fit(X_train, labels_train)
svc_labels_pred = svc_clf.predict(X_test)
print(classification_report(y_pred=svc_labels_pred, y_true=labels_test))

             precision    recall  f1-score   support

         ec       0.93      0.94      0.93     19924
       hist       0.97      0.95      0.96      5538
        law       0.94      0.93      0.94     19977
       ling       0.95      0.97      0.96     15060
        pol       0.93      0.91      0.92     19990
        psy       0.93      0.94      0.93     19947
        soc       0.94      0.94      0.94     20070

avg / total       0.94      0.94      0.94    120506



In [9]:
clf = MultinomialNB()
clf.fit(X_train, labels_train)
labels_pred = clf.predict(X_test)
print(classification_report(y_pred=labels_pred, y_true=labels_test))

             precision    recall  f1-score   support

         ec       0.85      0.90      0.87     19924
       hist       1.00      0.35      0.52      5538
        law       0.87      0.89      0.88     19977
       ling       0.95      0.87      0.91     15060
        pol       0.85      0.86      0.85     19990
        psy       0.86      0.91      0.88     19947
        soc       0.82      0.89      0.86     20070

avg / total       0.87      0.86      0.86    120506



In [10]:
clf = LogisticRegression()
clf.fit(X_train, labels_train)
labels_pred = clf.predict(X_test)
print(classification_report(y_pred=labels_pred, y_true=labels_test))

             precision    recall  f1-score   support

         ec       0.81      0.85      0.83     19924
       hist       0.95      0.63      0.76      5538
        law       0.83      0.82      0.82     19977
       ling       0.87      0.86      0.87     15060
        pol       0.80      0.78      0.79     19990
        psy       0.80      0.84      0.82     19947
        soc       0.79      0.80      0.79     20070

avg / total       0.82      0.82      0.82    120506



In [11]:
import pickle
pickle.dump(svc_clf, open('domain_model.sav', 'wb'))

In [37]:
tfidf_svc = Pipeline([('tfidf', vectorizer), ('svc', svc_clf)])

In [38]:
tfidf_svc.fit(texts_train, labels_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [40]:
from sklearn.externals import joblib
joblib.dump(tfidf_svc, 'domain_model.pkl')

['domain_model.pkl']