In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import matplotlib.pyplot as plt
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
from scipy.sparse import hstack

In [5]:
df_x = pd.read_csv("Data/df_x_nb4a-mis.csv", index_col = 0)
print(df_x.shape)

(171634, 34)


In [6]:
df_x['textlower'] = df_x['textlower'].fillna('')
df_x['message_dt'] = pd.to_datetime(df_x["message_dt"], format='%Y-%m-%d %H:%M:00')

### Text similarity imports/functions

In [7]:
import string, re, unidecode

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
stemmer = SnowballStemmer('spanish')
remove_punc = str.maketrans(string.punctuation, len(string.punctuation) * " ")
stopwords_ascii = [unidecode.unidecode(w) for w in stopwords.words('spanish')]

def tokenize(s):
    s = unidecode.unidecode(s)
    s = s.translate(remove_punc)
    tokens = nltk.word_tokenize(s)
    filtered = [w for w in tokens if w not in stopwords_ascii]
    return [stemmer.stem(w) for w in filtered]

In [9]:
def dummy(x):
    return x

vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

In [10]:
scaler = StandardScaler()

# Pre-processing

In [11]:
df_x['token'] = df_x['textlower'].apply(tokenize)

In [12]:
df_x['token_length'] = df_x['token'].apply(len)
df_x = df_x[df_x['token_length'] >= 5]

In [13]:
print(df_x.iloc[int(df_x.shape[0] * 0.8)]['message_dt'])

2020-03-28 13:48:00


In [14]:
df_test = df_x[df_x['message_dt'] > pd.Timestamp(2020, 3, 28, 13, 48)]
df_train = df_x[df_x['message_dt'] <= pd.Timestamp(2020, 3, 28, 13, 48)]
print(df_test.shape)
print(df_train.shape)

(8800, 36)
(35225, 36)


# Properties of scams

# CV

In [16]:
print(df_train.iloc[int(df_train.shape[0] * 0.2)]['message_dt'])
print(df_train.iloc[int(df_train.shape[0] * 0.4)]['message_dt'])
print(df_train.iloc[int(df_train.shape[0] * 0.6)]['message_dt'])
print(df_train.iloc[int(df_train.shape[0] * 0.8)]['message_dt'])

2020-02-22 10:00:00
2020-03-03 13:21:00
2020-03-13 11:47:00
2020-03-21 11:57:00


In [18]:
# Returns (recall, precision)
def rec_pre(trn, tst, clf):
    X_trn = vectorizer.fit_transform(trn['token'])
    X_tst = vectorizer.transform(tst['token'])

    clf.fit(X_trn, trn['scam'])

    tn, fp, fn, tp = confusion_matrix(tst['scam'], clf.predict(X_tst)).ravel()
    return tp / (tp + fn), tp / (tp + fp)

In [19]:
# Returns recall list, precision list

def forward_chain(clf, fn = rec_pre):
    stat = []
    
    # Using 0-20% of training, testing on 20-40%
    trn = df_train[df_train['message_dt'] <= pd.Timestamp(2020, 2, 22, 10)]
    tst = df_train[(df_train['message_dt'] > pd.Timestamp(2020, 2, 22, 10)) \
                      & (df_train['message_dt'] <= pd.Timestamp(2020, 3, 3, 13, 21))]
    
    stat.append(fn(trn, tst, clf))
    
    # Using 0-40% of training, testing on 40-60%
    trn = df_train[df_train['message_dt'] <= pd.Timestamp(2020, 3, 3, 13, 21)]
    tst = df_train[(df_train['message_dt'] > pd.Timestamp(2020, 3, 3, 13, 21)) \
                      & (df_train['message_dt'] <= pd.Timestamp(2020, 3, 13, 11, 47))]

    stat.append(fn(trn, tst, clf))
    
    # Using 0-60% of training, testing on 60-80%
    trn = df_train[df_train['message_dt'] <= pd.Timestamp(2020, 3, 13, 11, 47)]
    tst = df_train[(df_train['message_dt'] > pd.Timestamp(2020, 3, 13, 11, 47)) \
                      & (df_train['message_dt'] <= pd.Timestamp(2020, 3, 21, 11, 57))]
        
    stat.append(fn(trn, tst, clf))
    
    # Using 0-80% of training, testing on 80-100%
    trn = df_train[df_train['message_dt'] <= pd.Timestamp(2020, 3, 21, 11, 57)]
    tst = df_train[df_train['message_dt'] > pd.Timestamp(2020, 3, 21, 11, 57)]
        
    stat.append(fn(trn, tst, clf))
    
    return [s[0] for s in stat], [s[1] for s in stat]

# Tokens Only

### Logistic regression

In [20]:
#Too strong regularization results in not PREDICTING any positives. Specifically:

for c in [0.5, 0.6, 0.7, 0.8, 0.9]:
    clf = LogisticRegression(C = c)
    fc = forward_chain(clf)
    print(c, fc[0], fc[1])



0.5 [0.0, 0.026785714285714284, 0.03977272727272727, 0.2109375] [nan, 1.0, 1.0, 1.0]
0.6 [0.08247422680412371, 0.08928571428571429, 0.05113636363636364, 0.2109375] [1.0, 1.0, 1.0, 1.0]
0.7 [0.09278350515463918, 0.11607142857142858, 0.05113636363636364, 0.2109375] [1.0, 1.0, 1.0, 1.0]
0.8 [0.09278350515463918, 0.13392857142857142, 0.056818181818181816, 0.2109375] [1.0, 1.0, 1.0, 1.0]
0.9 [0.12371134020618557, 0.13392857142857142, 0.056818181818181816, 0.21875] [1.0, 1.0, 1.0, 1.0]


In [22]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.7, 0.85]:
    print("%s..." % c, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 5):
    c = 10**i
    print("%s..." % i, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Logistic Regression with Varying C ($L_2$ Regularization)")
plt.savefig('images/ch-misinformation/ml_log.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.7...0.85...0...1...2...3...4...

In [23]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.7, 0.85]:
    print("%s..." % c, end = '')
    clf = LogisticRegression(penalty = 'l1', C = c, solver = 'liblinear')
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 5):
    c = 10**i
    print("%s..." % i, end = '')
    clf = LogisticRegression(penalty = 'l1', C = c, solver = 'liblinear')
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Logistic Regression with Varying C ($L_1$ Regularization)")
plt.savefig('images/ch-misinformation/ml_log_l1.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.7...0.85...0...1...2...3...4...

### SVM

In [24]:
#Too strong regularization results in not PREDICTING any positives. Specifically:

for i in range(-5, 2):
    c = 10**i
    clf = SVC(C = c)
    fc = forward_chain(clf)
    print(i, fc[0], fc[1])



-5 [0.0, 0.0, 0.0, 0.0] [nan, nan, nan, nan]




-4 [0.0, 0.0, 0.0, 0.0] [nan, nan, nan, nan]




-3 [0.0, 0.0, 0.0, 0.0] [nan, nan, nan, nan]




-2 [0.0, 0.0, 0.0, 0.0] [nan, nan, nan, nan]




-1 [0.0, 0.044642857142857144, 0.03409090909090909, 0.15234375] [nan, 1.0, 1.0, 1.0]
0 [0.4329896907216495, 0.44642857142857145, 0.8181818181818182, 0.30859375] [0.9545454545454546, 0.9803921568627451, 1.0, 0.9875]
1 [0.4536082474226804, 0.49107142857142855, 0.8181818181818182, 0.31640625] [0.9565217391304348, 0.9821428571428571, 1.0, 0.9878048780487805]


In [25]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.25, 0.5, 0.75]:
    print("%s..." % c, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 3):
    c = 10**i
    print("%s..." % i, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("SVM with Varying C")
plt.savefig('images/ch-misinformation/ml_svm.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.25...0.5...0.75...0...1...2...

### Decision trees

In [26]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for md in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % md, end = '')
    clf = tree.DecisionTreeClassifier(max_depth = md)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = md)
    ax2.plot([1, 2, 3, 4], pre, label = md)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Decision Trees of Varying Depths")
plt.savefig('images/ch-misinformation/ml_dt.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

### RF

In [27]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for md in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % md, end = '')
    clf = RandomForestClassifier(max_depth = md)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = md)
    ax2.plot([1, 2, 3, 4], pre, label = md)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Random Forest Classifier of Varying Depths")
plt.savefig('images/ch-misinformation/ml_rf.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...



6...



9...12...15...18...21...24...

### Neighbors

In [28]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for kn in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % kn, end = '')
    clf = KNeighborsClassifier(n_neighbors = kn)
    rec, pre = forward_chain(clf)
    ax1.plot([1, 2, 3, 4], rec, label = kn)
    ax2.plot([1, 2, 3, 4], pre, label = kn)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("K-Nearest Neighbors Classifier")
plt.savefig('images/ch-misinformation/ml_knn.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

# Using word length

In [29]:
df_x['wordlength'] = df_x['text'].apply(lambda x: x.count(" "))

In [30]:
# Returns (recall, precision)
def rec_pre_wl(trn, tst, clf):
    X_trn = vectorizer.fit_transform(trn['token'])
    X_tst = vectorizer.transform(tst['token'])
    
    wl_trn = scaler.fit_transform(trn['text'].apply(lambda x: x.count(" ")).values.reshape(-1, 1))
    wl_tst = scaler.fit_transform(tst['text'].apply(lambda x: x.count(" ")).values.reshape(-1, 1))
    
    X_trn = hstack([X_trn, wl_trn])
    X_tst = hstack([X_tst, wl_tst])

    clf.fit(X_trn, trn['scam'])

    tn, fp, fn, tp = confusion_matrix(tst['scam'], clf.predict(X_tst)).ravel()
    return tp / (tp + fn), tp / (tp + fp)

### Logistic regression

In [31]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.7, 0.85]:
    print("%s..." % c, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf, rec_pre_wl)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 5):
    c = 10**i
    print("%s..." % i, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf, rec_pre_wl)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Logistic Regression with Varying C ($L_2$ Regularization)\nUsing Tokens + # of Words")
plt.savefig('images/ch-misinformation/ml_log_wl.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.7...0.85...0...1...2...3...4...

### SVM

In [32]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.25, 0.5, 0.75]:
    print("%s..." % c, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf, rec_pre_wl)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 3):
    c = 10**i
    print("%s..." % i, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf, rec_pre_wl)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("SVM with Varying C\nUsing Tokens + # of Words")
plt.savefig('images/ch-misinformation/ml_svm_wl.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.25...0.5...0.75...0...1...2...

### Decision trees

In [33]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for md in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % md, end = '')
    clf = tree.DecisionTreeClassifier(max_depth = md)
    rec, pre = forward_chain(clf, rec_pre_wl)
    ax1.plot([1, 2, 3, 4], rec, label = md)
    ax2.plot([1, 2, 3, 4], pre, label = md)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Decision Trees of Varying Depths\nUsing Tokens + # of Words")
plt.savefig('images/ch-misinformation/ml_dt_wl.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

### Neighbors

In [34]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for kn in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % kn, end = '')
    clf = KNeighborsClassifier(n_neighbors = kn)
    rec, pre = forward_chain(clf, rec_pre_wl)
    ax1.plot([1, 2, 3, 4], rec, label = kn)
    ax2.plot([1, 2, 3, 4], pre, label = kn)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("K-Nearest Neighbors Classifier\nUsing Tokens + # of Words")
plt.savefig('images/ch-misinformation/ml_knn_wl.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

# Using user country code

In [35]:
# Returns (recall, precision)
def rec_pre_cc(trn, tst, clf):
    X_trn = vectorizer.fit_transform(trn['token'])
    X_tst = vectorizer.transform(tst['token'])
    
    co_trn = trn['tel'].str.startswith('+57 ').values.reshape(-1, 1)
    vz_trn = trn['tel'].str.startswith('+58 ').values.reshape(-1, 1)
    
    co_tst = tst['tel'].str.startswith('+57 ').values.reshape(-1, 1)
    vz_tst = tst['tel'].str.startswith('+58 ').values.reshape(-1, 1)
    
    X_trn = hstack([X_trn, co_trn, vz_trn])
    X_tst = hstack([X_tst, co_tst, vz_tst])

    clf.fit(X_trn, trn['scam'])

    tn, fp, fn, tp = confusion_matrix(tst['scam'], clf.predict(X_tst)).ravel()
    return tp / (tp + fn), tp / (tp + fp)

### Logistic regression

In [36]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.7, 0.85]:
    print("%s..." % c, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf, rec_pre_cc)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 5):
    c = 10**i
    print("%s..." % i, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf, rec_pre_cc)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Logistic Regression with Varying C ($L_2$ Regularization)\nUsing Tokens + User Country Code")
plt.savefig('images/ch-misinformation/ml_log_cc.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.7...0.85...0...1...2...

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


3...

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


4...

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### SVM

In [37]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.25, 0.5, 0.75]:
    print("%s..." % c, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf, rec_pre_cc)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 3):
    c = 10**i
    print("%s..." % i, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf, rec_pre_cc)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("SVM with Varying C\nUsing Tokens + User Country Code")
plt.savefig('images/ch-misinformation/ml_svm_cc.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.25...0.5...0.75...0...1...2...

### Decision trees

In [38]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for md in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % md, end = '')
    clf = tree.DecisionTreeClassifier(max_depth = md)
    rec, pre = forward_chain(clf, rec_pre_cc)
    ax1.plot([1, 2, 3, 4], rec, label = md)
    ax2.plot([1, 2, 3, 4], pre, label = md)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Decision Trees of Varying Depths\nUsing Tokens + User Country Code")
plt.savefig('images/ch-misinformation/ml_dt_cc.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

### Neighbors

In [39]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for kn in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % kn, end = '')
    clf = KNeighborsClassifier(n_neighbors = kn)
    rec, pre = forward_chain(clf, rec_pre_cc)
    ax1.plot([1, 2, 3, 4], rec, label = kn)
    ax2.plot([1, 2, 3, 4], pre, label = kn)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("K-Nearest Neighbors Classifier\nUsing Tokens + User Country Code")
plt.savefig('images/ch-misinformation/ml_knn_cc.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

# Using group information

In [40]:
df_groups = pd.read_csv("Data/df_groups_nb3b-virality.csv", index_col = 0)
print(df_groups.shape)

(174, 66)


In [41]:
group_hhConc = df_groups['hhConc'].to_dict()
group_gini = df_groups['gini'].to_dict()

In [42]:
df_train['hhConc'] = df_train['uid'].apply(lambda x: group_hhConc[x])
df_train['gini'] = df_train['uid'].apply(lambda x: group_gini[x])

df_test['hhConc'] = df_test['uid'].apply(lambda x: group_hhConc[x])
df_test['gini'] = df_test['uid'].apply(lambda x: group_gini[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [43]:
# Returns (recall, precision)
def rec_pre_group(trn, tst, clf):
    X_trn = vectorizer.fit_transform(trn['token'])
    X_tst = vectorizer.transform(tst['token'])
    
    conc_trn = trn['hhConc'].values.reshape(-1, 1)
    gini_trn = trn['gini'].values.reshape(-1, 1)
    
    conc_tst = tst['hhConc'].values.reshape(-1, 1)
    gini_tst = tst['gini'].values.reshape(-1, 1)
    
    X_trn = hstack([X_trn, conc_trn, gini_trn])
    X_tst = hstack([X_tst, conc_tst, gini_tst])

    clf.fit(X_trn, trn['scam'])

    tn, fp, fn, tp = confusion_matrix(tst['scam'], clf.predict(X_tst)).ravel()
    return tp / (tp + fn), tp / (tp + fp)

### Logistic regression

In [44]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.7, 0.85]:
    print("%s..." % c, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf, rec_pre_group)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 5):
    c = 10**i
    print("%s..." % i, end = '')
    clf = LogisticRegression(C = c)
    rec, pre = forward_chain(clf, rec_pre_group)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Logistic Regression with Varying C ($L_2$ Regularization)\nUsing Tokens + Group Concentration, Inequality")
plt.savefig('images/ch-misinformation/ml_log_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.7...0.85...0...1...2...3...4...

### SVM

In [45]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for c in [0.25, 0.5, 0.75]:
    print("%s..." % c, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf, rec_pre_group)
    ax1.plot([1, 2, 3, 4], rec, label = c)
    ax2.plot([1, 2, 3, 4], pre, label = c)
    
for i in range(0, 3):
    c = 10**i
    print("%s..." % i, end = '')
    clf = SVC(C = c)
    rec, pre = forward_chain(clf, rec_pre_group)
    ax1.plot([1, 2, 3, 4], rec, label = "10^%s" % i)
    ax2.plot([1, 2, 3, 4], pre, label = "10^%s" % i)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("SVM with Varying C\nUsing Tokens + Group Concentration, Inequality")
plt.savefig('images/ch-misinformation/ml_svm_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

0.25...0.5...0.75...0...1...2...

### Decision trees

In [46]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for md in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % md, end = '')
    clf = tree.DecisionTreeClassifier(max_depth = md)
    rec, pre = forward_chain(clf, rec_pre_group)
    ax1.plot([1, 2, 3, 4], rec, label = md)
    ax2.plot([1, 2, 3, 4], pre, label = md)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("Decision Trees of Varying Depths\nUsing Tokens + Group Concentration, Inequality")
plt.savefig('images/ch-misinformation/ml_dt_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

### Neighbors

In [47]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 8))

for kn in [3, 6, 9, 12, 15, 18, 21, 24]:
    print("%s..." % kn, end = '')
    clf = KNeighborsClassifier(n_neighbors = kn)
    rec, pre = forward_chain(clf, rec_pre_group)
    ax1.plot([1, 2, 3, 4], rec, label = kn)
    ax2.plot([1, 2, 3, 4], pre, label = kn)
    
ax1.legend()
ax2.legend()

ax1.set_xlabel("Training Size / Validation Size")
ax2.set_xlabel("Training Size / Validation Size")

ax1.set_ylabel("Recall (Detected / Actual Scams)")
ax2.set_ylabel("Precision (Actual / Flagged Scams)")

plt.suptitle("K-Nearest Neighbors Classifier\nUsing Tokens + Group Concentration, Inequality")
plt.savefig('images/ch-misinformation/ml_knn_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

3...6...9...12...15...18...21...24...

# Test

### Nearest neighbors

In [48]:
X_train = vectorizer.fit_transform(df_train['token'])
X_test = vectorizer.transform(df_test['token'])

In [49]:
clf = KNeighborsClassifier(n_neighbors = 3)
clf.fit(X_train, df_train['scam'])

print(confusion_matrix(df_test['scam'], clf.predict(X_test)))

[[8637   11]
 [  49  103]]


### Decision tree

In [50]:
wl_train = scaler.fit_transform(df_train['text'].apply(lambda x: x.count(" ")).values.reshape(-1, 1))
wl_test = scaler.fit_transform(df_test['text'].apply(lambda x: x.count(" ")).values.reshape(-1, 1))

co_train = df_train['tel'].str.startswith('+57 ').values.reshape(-1, 1)
vz_train = df_train['tel'].str.startswith('+58 ').values.reshape(-1, 1)

co_test = df_test['tel'].str.startswith('+57 ').values.reshape(-1, 1)
vz_test = df_test['tel'].str.startswith('+58 ').values.reshape(-1, 1)

X_train_stack = hstack([X_train, co_train, vz_train, wl_train])
X_test_stack = hstack([X_test, co_test, vz_test, wl_test])

clf = tree.DecisionTreeClassifier(max_depth = 12)
clf.fit(X_train_stack, df_train['scam'])

print(confusion_matrix(df_test['scam'], clf.predict(X_test_stack)))

[[8641    7]
 [  94   58]]


In [51]:
wl_train = scaler.fit_transform(df_train['text'].apply(lambda x: x.count(" ")).values.reshape(-1, 1))
wl_test = scaler.fit_transform(df_test['text'].apply(lambda x: x.count(" ")).values.reshape(-1, 1))

co_train = df_train['tel'].str.startswith('+57 ').values.reshape(-1, 1)
vz_train = df_train['tel'].str.startswith('+58 ').values.reshape(-1, 1)

co_test = df_test['tel'].str.startswith('+57 ').values.reshape(-1, 1)
vz_test = df_test['tel'].str.startswith('+58 ').values.reshape(-1, 1)

X_train_stack = hstack([X_train, co_train, vz_train, wl_train])
X_test_stack = hstack([X_test, co_test, vz_test, wl_test])

clf = tree.DecisionTreeClassifier(max_depth = 3)
clf.fit(X_train_stack, df_train['scam'])

print(confusion_matrix(df_test['scam'], clf.predict(X_test_stack)))

[[8638   10]
 [ 114   38]]


In [52]:
from sklearn.tree import export_text
from sklearn.tree import export_graphviz

In [53]:
tree_rules = export_text(clf)
print(tree_rules)

|--- feature_31624 <= 0.08
|   |--- feature_34343 <= 0.24
|   |   |--- feature_15932 <= 0.11
|   |   |   |--- class: False
|   |   |--- feature_15932 >  0.11
|   |   |   |--- class: True
|   |--- feature_34343 >  0.24
|   |   |--- feature_42906 <= -0.18
|   |   |   |--- class: False
|   |   |--- feature_42906 >  -0.18
|   |   |   |--- class: True
|--- feature_31624 >  0.08
|   |--- feature_10757 <= 0.05
|   |   |--- feature_9357 <= 0.06
|   |   |   |--- class: True
|   |   |--- feature_9357 >  0.06
|   |   |   |--- class: False
|   |--- feature_10757 >  0.05
|   |   |--- class: False



In [54]:
token_names = vectorizer.get_feature_names() + ['CO User', 'VZ User', '# Words']

In [55]:
dotfile = open("images/ch-misinformation/tree.dot", 'w')
export_graphviz(clf, out_file=dotfile, feature_names=token_names)
dotfile.close()