## Classificazione con sklearn


In [1]:
import pandas as pd
df = pd.read_csv('.../groupbyTOT.csv', delimiter=',', skiprows=0, lineterminator='\n', low_memory=False)
df

Unnamed: 0,date,tweets,value\r
0,2019-04-01,crypto update bitcoin ether ripple xrp bitcoin...,bull\r
1,2019-04-02,enjoy vultr credit deploy masternode high perf...,bull\r
2,2019-04-03,never saw article written strong skeptic minds...,bull\r
3,2019-04-04,apr utc € £ bitcoin btc pic twitter com srliib...,bull\r
4,2019-04-05,x gpu mining frame l h end date thursday may p...,bull\r
...,...,...,...
217,2019-11-19,overall btc still asymetrical trade meaning st...,bear\r
218,2019-11-20,trading bitcoin really helping lot people buil...,bear\r
219,2019-11-21,"khaleesi really kill parents go btc,xtz large ...",bear\r
220,2019-11-22,thebamboo ico crowdsale bitcoin blockchain tok...,bull\r


In [2]:
df.columns = ['date', 'tweet', 'value']
df['value'].replace(to_replace="bull.+", value=1, regex=True, inplace=True)
df['value'].replace(to_replace="bear.+", value=0, regex=True, inplace=True)
df

Unnamed: 0,date,tweet,value
0,2019-04-01,crypto update bitcoin ether ripple xrp bitcoin...,1
1,2019-04-02,enjoy vultr credit deploy masternode high perf...,1
2,2019-04-03,never saw article written strong skeptic minds...,1
3,2019-04-04,apr utc € £ bitcoin btc pic twitter com srliib...,1
4,2019-04-05,x gpu mining frame l h end date thursday may p...,1
...,...,...,...
217,2019-11-19,overall btc still asymetrical trade meaning st...,0
218,2019-11-20,trading bitcoin really helping lot people buil...,0
219,2019-11-21,"khaleesi really kill parents go btc,xtz large ...",0
220,2019-11-22,thebamboo ico crowdsale bitcoin blockchain tok...,1


In [3]:
from sklearn.model_selection import train_test_split

x = df['tweet'].values
y = df['value'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1000)

In [4]:
import numpy as np
sample_idx = 10
y_train_bin = np.asarray(y_train)==y_train[sample_idx]
y_test_bin = np.asarray(y_test)==y_train[sample_idx]
y_train_bin,y_test_bin

(array([False, False, False,  True, False,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False,  True,  True,
         True,  True,  True, False, False,  True,  True,  True, False,
         True, False,  True,  True,  True,  True,  True,  True, False,
        False, False, False,  True,  True,  True, False, False, False,
        False, False, False,  True,  True,  True,  True,  True,  True,
        False, False,  True, False, False,  True, False,  True,  True,
         True,  True,  True, False,  True,  True,  True,  True,  True,
        False,  True,  True, False,  True,  True, False, False, False,
         True, False,  True,  True, False,  True,  True,  True, False,
        False,  True,  True,  True, False,  True, False,  True,  True,
         True, False, False, False, False,  True,  True,  True,  True,
        False,  True, False,  True, False,  True,  True,  True, False,
         True, False,  True,  True, False,  True,  True, False,  True,
      

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn import svm

In [6]:
bin_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('sel', SelectKBest(chi2, k=500)),
    ('tfidf', TfidfTransformer()),
    ('learner', LinearSVC())
])

bin_pipeline.fit(x_train,y_train_bin)
bin_predictions = bin_pipeline.predict(x_test)
correct = 0
for prediction,true_label in zip(bin_predictions, y_test_bin):
    if prediction==true_label:
        correct += 1
print(correct/len(bin_predictions))

0.6716417910447762


In [7]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test_bin, bin_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test_bin, bin_predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

       False       0.83      0.19      0.31        26
        True       0.66      0.98      0.78        41

    accuracy                           0.67        67
   macro avg       0.74      0.58      0.55        67
weighted avg       0.72      0.67      0.60        67

Confusion matrix:
[[ 5 21]
 [ 1 40]]


In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
param_grid_k = [
    {'sel__k': [1000, 2000, 5000, 'all']}
 ]

opt_pipeline_k = Pipeline([
#    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

In [10]:
vect = CountVectorizer(min_df=5)  # tokenization and frequency count

X_train_tok = vect.fit_transform(x_train)

X_test_tok =vect.transform(x_test)

In [11]:
n_jobs = 3
opt_search_k = GridSearchCV(opt_pipeline_k, param_grid_k, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train_tok,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  20 out of  20 | elapsed:   12.1s finished


In [12]:
opt_search_k.best_params_

{'sel__k': 5000}

In [13]:
param_grid_c = [
    {'learner__C': [0.01, 0.1, 1, 10, 100]},
 ]

opt_pipeline_c = Pipeline([
#    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=opt_search_k.best_params_['sel__k'])),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

In [14]:
opt_search_c = GridSearchCV(opt_pipeline_c, param_grid_c, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train_tok,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:   10.0s finished


In [15]:
opt2_predictions = opt_search_c.best_estimator_.predict(X_test_tok)

correct = 0
for prediction,true_label in zip(opt2_predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(opt2_predictions))

0.746268656716418


## Inspecting the pipeline

In [16]:
tokenizer = bin_pipeline.named_steps['vect']
selector = bin_pipeline.named_steps['sel']
classifier = bin_pipeline.named_steps['learner']

In [17]:
feature_names = tokenizer.get_feature_names()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(selector.get_support(),selector.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score)
len(feats_w_score)

538931

In [18]:
#the 100 less and most informative features
feats_w_score[:100],feats_w_score[-100:]

([(4.121395749726103e-30, False, 'statistical'),
  (7.397478347567422e-07, False, 'beers'),
  (8.027756771825635e-07, False, 'lgbs'),
  (2.05768937944116e-06, False, 'conspiracies'),
  (2.05768937944116e-06, False, 'doh'),
  (2.05768937944116e-06, False, 'gmer'),
  (2.05768937944116e-06, False, 'hyundai'),
  (2.05768937944116e-06, False, 'iefun'),
  (2.05768937944116e-06, False, 'unnecessarily'),
  (2.579524150029678e-06, False, 'staying'),
  (2.632632000169499e-06, False, 'dolarlık'),
  (2.632632000169499e-06, False, 'fenerbahce'),
  (2.632632000169499e-06, False, 'imported'),
  (2.632632000169499e-06, False, 'jonny'),
  (2.632632000169499e-06, False, 'lng'),
  (2.632632000169499e-06, False, 'progressed'),
  (2.632632000169499e-06, False, 'rethinking'),
  (2.632632000169499e-06, False, 'southsanfrancisco'),
  (2.632632000169499e-06, False, 'whine'),
  (3.1345735099351286e-06, False, 'renewed'),
  (3.886436385593956e-06, False, 'threshold'),
  (4.11537875888232e-06, False, 'conservativ

In [19]:
feats_w_classifier_weight = list()
for index,weight in enumerate(selector.inverse_transform(classifier.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

500

These are the feature that most contribute to a positive decision

In [20]:
#the feature that most contribute to a positive decision
feats_w_classifier_weight[-100:]

[(0.07814908208740697, 'atlas'),
 (0.07869496334223622, 'dorsey'),
 (0.07907373438088106, 'opera'),
 (0.07935799874103026, 'luxury'),
 (0.07936689011721212, 'pizzaday'),
 (0.08229086044966578, 'tls'),
 (0.08245184840829001, 'statum'),
 (0.08245296291385991, 'drife'),
 (0.08253620790345849, 'blessings'),
 (0.0833018173421039, 'crazy'),
 (0.08351072985467341, 'script'),
 (0.08492450753453473, 'jesusisking'),
 (0.0863349676180379, 'btg'),
 (0.0872863970087265, 'tessline'),
 (0.08762599061389782, 'viet'),
 (0.08831147029312174, 'fashion'),
 (0.08847162043230689, 'exclusive'),
 (0.08869544886628569, 'schiff'),
 (0.08878823362288864, 'twitter'),
 (0.08958994907592432, 'ovc'),
 (0.09145203804516183, 'tokensnet'),
 (0.09146065974440601, 'surges'),
 (0.09158301347972007, 'nash'),
 (0.09431739868370809, 'dapp'),
 (0.09493541158802678, 'parabolic'),
 (0.09710610744913414, 'fomo'),
 (0.0974366854587402, 'gs'),
 (0.09915585587235921, 'fresh'),
 (0.10029449439162216, 'accouncing'),
 (0.1012951657739

In [21]:
#the features that most contribute to a negative decision.
feats_w_classifier_weight[:100]

[(-1.4785522166645215, 'hereistitle'),
 (-0.8655617425286473, 'cryptocurrency'),
 (-0.5840004519422687, 'free'),
 (-0.5814547811420195, 'get'),
 (-0.5757913189574438, 'account'),
 (-0.5014926349937073, 'jul'),
 (-0.4510615024210688, 'every'),
 (-0.44334208125026137, 'average'),
 (-0.416529546502967, 'procoin'),
 (-0.41598907845751726, 'nov'),
 (-0.4070561527301387, 'volume'),
 (-0.40432035847103853, 'airdrop'),
 (-0.3850097342385612, 'btcnews'),
 (-0.37757882205143267, 'bitcome'),
 (-0.3748806200653926, 'crypto'),
 (-0.3722607147231822, 'november'),
 (-0.35331092978140194, 'join'),
 (-0.35195619486379304, 'kingdom'),
 (-0.3384156434387671, 'time'),
 (-0.3362764150359323, 'dacx'),
 (-0.3308529447961845, 'duvets'),
 (-0.3227246112859075, 'careers'),
 (-0.3217238346164694, 'jobs'),
 (-0.3198052859657752, 'cryptonews'),
 (-0.30511155570411164, 'automatically'),
 (-0.2987330971432941, 'link'),
 (-0.2982385758938793, 'hiring'),
 (-0.29251665488270795, 'sleep'),
 (-0.2921696572098773, 'busine

### Naive Bayes

In [25]:
nb_bin_pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('learner', MultinomialNB())  # learning algorithm
])

nb_bin_pipeline.fit(x_train,y_train_bin)
bin_predictions = nb_bin_pipeline.predict(x_test)

print('Classification report:')
print(classification_report(y_test_bin, bin_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test_bin, bin_predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

       False       0.56      0.77      0.65        26
        True       0.81      0.61      0.69        41

    accuracy                           0.67        67
   macro avg       0.68      0.69      0.67        67
weighted avg       0.71      0.67      0.68        67

Confusion matrix:
[[20  6]
 [16 25]]


In [26]:
tokenizer = nb_bin_pipeline.named_steps['vect']
selector = nb_bin_pipeline.named_steps['sel']
classifier = nb_bin_pipeline.named_steps['learner']


In [27]:
classifier.class_log_prior_,classifier.feature_log_prob_, len(classifier.feature_log_prob_[0])

(array([-1.00037385, -0.45845764]),
 array([[-14.35715957, -13.32106764, -12.09161575, ..., -13.22869432,
         -13.25854728, -13.22869432],
        [-11.82676442, -17.19740245, -14.3641891 , ..., -16.50425527,
         -17.19740245, -17.19740245]]),
 5000)

In [28]:
ratio = classifier.feature_log_prob_[0]/classifier.feature_log_prob_[1]

In [29]:
feats_w_classifier_weight = list()
feature_names = tokenizer.get_feature_names()
for index,weight in enumerate(selector.inverse_transform([ratio])[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

5000

In [30]:
#the most relevant features for a positive decision
feats_w_classifier_weight[-100::-1]


 (1.1833633735455515, 'authorizes'),
 (1.1821601709720189, 'obliterates'),
 (1.1819902738481574, 'htlcs'),
 (1.1817751357932482, 'ابو'),
 (1.1807687827432403, 'qpocket'),
 (1.1806962478530934, 'megapump'),
 (1.179957366964322, 'themovie'),
 (1.1799062429190355, 'myob'),
 (1.1785496204593957, 'costumer'),
 (1.1785104016069712, 'dampens'),
 (1.1778226698440852, 'sleuths'),
 (1.1777497663994516, '以太坊'),
 (1.1777112126878033, 'déjà'),
 (1.1772810675148238, 'livestreams'),
 (1.1772164574111554, 'breez'),
 (1.1769400375962347, '区块链'),
 (1.1767980139716225, 'xmb'),
 (1.1767980139716225, 'fbcampaign'),
 (1.1765915194089014, 'yobi'),
 (1.1764563881895538, 'bmo'),
 (1.1761444265318266, 'ftykjevlrzhhxxmwsfrxscxtgb'),
 (1.1757500520198771, 'destructing'),
 (1.1753604750881441, 'mcc'),
 (1.1750393213546428, 'yoongi'),
 (1.1747448122723856, 'tripped'),
 (1.1747448122723856, 'nathaniel'),
 (1.1744044014049857, 'auburn'),
 (1.1741513123120524, 'webs'),
 (1.1739471905128995, 'eck'),
 (1.17386597622524

In [31]:
#the most relevat features for a negative decision.
feats_w_classifier_weight[:100]

[(0.4197231703928112, 'hereistitle'),
 (0.5138021500024011, 'duvets'),
 (0.5625059612856873, 'procoin'),
 (0.6063193606206161, 'القدر'),
 (0.629262176454155, 'pahoo'),
 (0.6355061333233503, 'pariss'),
 (0.6403322644683513, 'gbiarritz'),
 (0.6407062099144417, 'phcserialkiller'),
 (0.6408940883217231, 'totliv'),
 (0.6466247022340429, 'protectphgirls'),
 (0.6470415378288358, 'ليفربول'),
 (0.6485244532218408, 'internationalmensday'),
 (0.648739419516451, 'laylatulqadr'),
 (0.6529824451759424, 'الفطر'),
 (0.6568125250489909, 'getwellsoonscoups'),
 (0.6682038610042477, 'btchub'),
 (0.6704098599446526, 'everincreasing'),
 (0.6756401984757803, 'gfrance'),
 (0.6784430695466621, 'amberliuisoverparty'),
 (0.6789751233122626, 'worldtoiletday'),
 (0.6798968531793904, 'lifeofahustler'),
 (0.6804064599814782, 'عيد'),
 (0.6806376060817779, 'kepa'),
 (0.6817667350833807, 'australiatalks'),
 (0.6825318555494936, 'emergencyinjnu'),
 (0.6826877307614618, 'chickenrepublic'),
 (0.6833747482443682, 'توتنهام'