In [1]:
from ptracking.topic.lda_tomoto import tomoto_load_model
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from ptracking.predict import Dataset
from ptracking.twitter_scraper.twitter_scraper import TwitterFetcher

#cross-validation splitter shared by every model cell below; it is created with the library defaults only
#NOTE(review): every model cell prints five fold scores, so the default in use is 5 folds - consider passing n_splits (and shuffle/random_state) explicitly so the results do not depend on the installed scikit-learn defaults
skf = StratifiedKFold()

In [2]:
#location of the saved topic model (topic/tomoto_topic.mdl inside the ptracking package) - change it to match your machine
#NOTE(review): this is an absolute path on one developer's machine; prefer a path relative to the repository
TOPIC_MODEL_PATH = 'C:/Users/mihut/Desktop/petition-tracking/ptracking/topic/tomoto_topic.mdl'

#use this to load the topic features that were already calculated and their associated model
data, model = tomoto_load_model(TOPIC_MODEL_PATH)

#use this if you want to manually create the model (tomoto_topics is not imported in the first cell - import it before uncommenting)
#data, model = tomoto_topics(30, 200)

In [5]:
#any feature set can be plugged in here as long as it is a dataframe indexed by petition_id; this notebook uses twitter features
tweets = TwitterFetcher().select_tweet_count()
#topic features and twitter features side by side
features = data.join(tweets)

#collect the other information from the table (no. of signatures, created date, class etc), attach the features,
#order the rows by creation date and turn the index back into a regular column
dataset = (
  Dataset()
  .prepare(columns=["created_at"])
  .join(features)
  .sort_values("created_at")
  .reset_index()
)

#feature vector: every column from positional index 4 onwards (the first four columns are skipped)
X = np.array(dataset.iloc[:, 4:].values.tolist())
#label vector is always the class column
y = np.array(dataset['class'].values.tolist())

In [6]:
#this piece of code should be the same for all models, currently working with stratified cross validation and using MCC as metric
xgb = XGBClassifier()
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  xgb.fit(X_train, y_train)
  y_pred = xgb.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.59, 0.58, 0.59, 0.56, 0.5]
0.56


In [27]:
#k-nearest-neighbours baseline, evaluated with the same stratified cross validation / MCC loop as the other models
#NOTE(review): KNN is distance based and no feature scaling is applied in the visible cells - consider standardising X
knn = KNeighborsClassifier(n_jobs=-1)
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  knn.fit(X_train, y_train)
  y_pred = knn.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.52, 0.55, 0.53, 0.49, 0.47]
0.51


In [31]:
#support vector classifier with balanced class weights, evaluated with the same stratified cross validation / MCC loop as the other models
#NOTE(review): no feature scaling is applied in the visible cells and SVMs are sensitive to feature scale - consider standardising X
svc = SVC(class_weight='balanced')
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  svc.fit(X_train, y_train)
  y_pred = svc.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.26, 0.27, 0.28, 0.24, 0.21]
0.25


In [29]:
#decision tree baseline, evaluated with the same stratified cross validation / MCC loop as the other models
#fixed random_state so that re-running the cell reproduces the same scores
dt = DecisionTreeClassifier(random_state=0)
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  dt.fit(X_train, y_train)
  y_pred = dt.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.47, 0.45, 0.45, 0.42, 0.41]
0.44


In [30]:
#random forest, evaluated with the same stratified cross validation / MCC loop as the other models
#fixed random_state so that re-running the cell reproduces the same scores
rf = RandomForestClassifier(random_state=0)
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  rf.fit(X_train, y_train)
  y_pred = rf.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.59, 0.59, 0.58, 0.52, 0.49]
0.55


In [34]:
#multi-layer perceptron, evaluated with the same stratified cross validation / MCC loop as the other models
#fixed random_state so that weight initialisation (and therefore the scores) is reproducible between runs
#NOTE(review): no feature scaling is applied in the visible cells - consider standardising X for this model
mlp = MLPClassifier(random_state=0)
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  mlp.fit(X_train, y_train)
  y_pred = mlp.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.54, 0.48, 0.56, 0.5, 0.52]
0.52


In [35]:
#complement naive bayes baseline, evaluated with the same stratified cross validation / MCC loop as the other models
nb = ComplementNB()
#unrounded MCC of every fold; rounding is applied only when printing so the mean is not computed from already-rounded values
scores = list()

for train_index, test_index in skf.split(X, y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  #fit on this fold's training rows and score the held-out rows
  nb.fit(X_train, y_train)
  y_pred = nb.predict(X_test)
  scores.append(matthews_corrcoef(y_test, y_pred))

#per-fold scores, then their mean, both shown with two decimals
print([round(float(score), 2) for score in scores])
print(round(float(np.mean(scores)), 2))

[0.34, 0.32, 0.33, 0.3, 0.3]
0.32


In [8]:
#fetch the petition_id and created_at fields through TwitterFetcher.select
#NOTE(review): this overwrites `data`, which holds the topic features loaded earlier - re-running the feature cell after this one would join the wrong frame; consider a different name
#NOTE(review): select is called on the class here, while the feature cell instantiates TwitterFetcher() first - confirm select does not need an instance
data = TwitterFetcher.select('petition_id','created_at')

In [6]:
#rebuild the petition table, this time requesting the signatures column
#NOTE(review): this overwrites the `dataset` that X and y were built from - consider a different name
#NOTE(review): the saved output under this cell lists tweet_id/petition_id rows, which a plain assignment would not display - it looks stale, re-run or clear it
dataset = Dataset().prepare(columns=["signatures"])

Unnamed: 0_level_0,petition_id
tweet_id,Unnamed: 1_level_1
2473913,572973
2473914,572973
2473915,572973
2473916,572973
2473917,572973
...,...
2473908,572973
2473909,572973
2473910,572973
2473911,572973
