In [57]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [3]:
df = pd.read_csv('/Users/anton/mywork/Datasets/Quora/train.csv')

In [4]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_valid_index in split.split(df, df.target):
    train_set = df.iloc[train_index]
    test_valid_set = df.iloc[test_valid_index]

split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for test_index, valid_index in split2.split(test_valid_set, test_valid_set.target):
    test_set = test_valid_set.iloc[test_index]
    valid_set = test_valid_set.iloc[valid_index]

In [6]:
print("Train set", train_set.shape[0], sum(train_set.target) / train_set.shape[0])
print("Valid set", valid_set.shape[0], sum(valid_set.target) / valid_set.shape[0])
print("Test set", test_set.shape[0], sum(test_set.target) / test_set.shape[0])

Train set 1044897 0.06187021304492213
Valid set 130613 0.06186979856522704
Test set 130612 0.061870272256760485


In [7]:
vectorizer = HashingVectorizer(n_features=2**15, norm=None, alternate_sign=False)
X_train_transformed = vectorizer.fit_transform(train_set.question_text)
y_train = train_set.target

In [8]:
X_valid_transformed = vectorizer.transform(valid_set.question_text)
y_valid = valid_set.target

In [9]:
log_clf = LogisticRegression(random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_valid_transformed)

In [10]:
precision_recall_fscore_support(y_valid, y_pred)

(array([0.96363202, 0.69425242]),
 array([0.98736657, 0.43497092]),
 array([0.97535492, 0.5348448 ]),
 array([122532,   8081]))

In [91]:
y_train_pred = log_clf.predict(X_train_transformed)
precision_recall_fscore_support(y_train, y_train_pred)

(array([0.96629617, 0.74322565]),
 array([0.98913439, 0.47687477]),
 array([0.97758191, 0.58097769]),
 array([980249,  64648]))

In [29]:
train_set.question_text.iloc[0]

'Which is the best career after 12th?'

In [30]:
print(X_train_transformed[0])

  (0, 12071)	1.0
  (0, 12119)	1.0
  (0, 13677)	1.0
  (0, 16393)	1.0
  (0, 19280)	1.0
  (0, 24734)	1.0
  (0, 29201)	1.0


In [31]:
test_text = 'Which is the best career after 12th?'
print(vectorizer.transform([test_text]))

  (0, 12071)	1.0
  (0, 12119)	1.0
  (0, 13677)	1.0
  (0, 16393)	1.0
  (0, 19280)	1.0
  (0, 24734)	1.0
  (0, 29201)	1.0


In [70]:
test_text = 'career'
t = vectorizer.transform([test_text])
t.indices[0]

19280

In [44]:
log_clf.coef_[0].shape

(32768,)

In [62]:
test_text = 'Which is the best career after 12th?'
re.findall(r'\b\w\w+\b', test_text)

['Which', 'is', 'the', 'best', 'career', 'after', '12th']

In [75]:
words = set([word.lower() for question in train_set.question_text for word in re.findall(r'\b\w\w+\b', question)])

In [76]:
len(words)

173236

In [77]:
m = {}
for word in words:
    transformed = vectorizer.transform([word])
    if transformed.nnz == 1:
        code = transformed.indices[0]
        if code in m:
            m[code] += ',' + word
        else:
            m[code] = word


In [89]:
for idx in np.flip(np.argsort(log_clf.coef_[0]), axis=0)[:30]:
    print(log_clf.coef_[0][idx], m[idx])

5.117441879213295 hade,castrate,withopen,manolas,phoney,carradine,guesser
4.694228376341662 swarup,castrated,pastes,vellore,aranmula
4.503698248897159 cuz,aneurysim,castration
3.8407321608565717 alexandrino,alabamians,aball
3.4272882367062403 incest
3.3244785779910124 fuck,000usd,kolawa,giblatar,pgdbp,ch3ch2chclcooh,systemizer,whivh,yeshua,trumo,mingshan,sjit
3.3237673731705164 moron,webdevlopment,gepi,mobirise,hie,peaple,hornell
3.1490989959571882 quicksgear,dugish,farragut,manslamming,idiots
3.1218573181241736 multifamily,nwea,abramoviches,asshole,bildt,hunduran,98y,tiphares
3.0963842792007257 drmkpro64,cunt,makemake
2.915995283958546 glancation,rejuvalex,helter,potencial,givingi,tsukuba,smirking
2.8498924275472715 butyne,neovascularization,accupuncture,kothaguda,assholes,bengald
2.7952521318883967 drumpf,enchilada,arras
2.7385796130849775 leucoderma,chaturthi,bullshit,softwaretraining,filth,joads
2.725068195953779 phoshoglycerids,ucsi,399,3164,leno,zing,fucking,crimimal
2.7057595361

In [90]:
for idx in np.argsort(log_clf.coef_[0])[:30]:
    print(log_clf.coef_[0][idx], m[idx])

-3.186465706957477 computerize,beechview,illegalised,shilphata,vit
-2.4103820103805615 interconnection,combi,ninian,cmc,cerelac,unranked,shilometer,sechin,subconcussive
-2.1761040625958286 silkiest,parsifal,yallapana,brahmputra,ssc,sulfur
-2.0206153505792503 chall,unforced,thr,marries
-2.015232337781107 compuksary,salon,cloudy,1896,keyboarding
-1.890202610903577 doofinshmirtz,muskmelon,unitec,goldmans,christmass,armor,salesfourse,socialblade,nucleàr,trix,bluntly,flic
-1.8868976675551077 zonnie,fatherland
-1.8814598453455995 nayudu,pentel,simen,junction,bery,oxide,munaf,camus,scraping
-1.866323980785272 kongregate,ummm,pds,speakeasy,remained,akif,sakte
-1.8306883521848019 lpu,hadal,foreskin,counciling,engeniar,candies
-1.804112033001717 mcnp,pinewood,kornienko,insteresting,v5,advertising,impotance,duckhead
-1.791274573831265 penitents,pcs,torrents,tack,sprinkled,bacteroidetes
-1.7814013582124546 scarves,multiplied,bhatnagar,temba,60le650,esop
-1.776453480664684 yellowknife,111111,overse

In [96]:
X_train_transformed

<1044897x32768 sparse matrix of type '<class 'numpy.float64'>'
	with 12077247 stored elements in Compressed Sparse Row format>

In [94]:
X_train_transformed.nnz

12077247

Roughly 120MB