In [None]:
import pandas as pd
import math
import time
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
import json
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

## Load the data

In [None]:
# Training frame: provides the features and the `retweet_count` target.
data = pd.read_csv("data/training_process.csv")
# Evaluation frame: provides the features and the tweet `id` used in the output file.
data_eval = pd.read_csv("data/evaluation_process.csv")

## First part : Classification phase

### Transform data

First, we need to convert the pandas DataFrame into NumPy arrays.

In [None]:
def to_list(L) :
    """Parse the string form of a list of non-negative integers.

    Example: "[1,2,565454,1]" -> [1, 2, 565454, 1]

    Parameters
    ----------
    L : str
        Text made of digits separated by ",", "[" or "]". Spaces are ignored.

    Returns
    -------
    list of int
        The integers in the order they appear. Empty fields (as in "[1,]"
        or "[]") produce no element.

    Raises
    ------
    ValueError
        If a character is neither a digit, a space nor a separator.
    """
    separators = (",", "[", "]")
    T = []
    current = None  # number being accumulated; None while between numbers
    for char in L :
        if char == " " :
            continue
        if char in separators :
            # Only flush when digits were actually read, so that consecutive
            # separators never emit a spurious 0.
            if current is not None :
                T.append(current)
                current = None
            continue
        digit = int(char)  # raises ValueError on anything that is not a digit
        current = digit if current is None else current * 10 + digit
    # Keep a trailing number that is not followed by a separator.
    if current is not None :
        T.append(current)
    return T

# Columns copied into the feature matrix, in column order of X.
FEATURE_COLUMNS = (
    "user_verified",
    "user_statuses_count",
    "user_followers_count",
    "user_friends_count",
    "num_hashtag",
    "got_hashtag",
    "num_at",
    "got_at",
    "num_link",
    "got_link",
    "length",
    "contains_rt",
    "weak",
    "strong",
    "is_upper",
    "contains_excl",
    "contains_per",
    "contains_org",
    "contains_gpe",
    "zeros_pic",
    "sentiment",
)


def from_pd_to_numpy(database,training=True) :
    """Convert a tweet DataFrame into a feature matrix and a companion vector.

    Parameters
    ----------
    database : pandas.DataFrame
        Must contain every column listed in FEATURE_COLUMNS, plus
        `retweet_count` (when training) or `id` (otherwise).
    training : bool, default True
        If True the second return value holds `retweet_count`; if False it
        holds `id`.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, 21), dtype float64
        One column per entry of FEATURE_COLUMNS, in that order.
    y : numpy.ndarray, shape (n_rows,), dtype float64
        Retweet counts or tweet ids, depending on `training`.
        NOTE(review): ids are stored as float64, which is exact only up to
        2**53; confirm the ids in the evaluation file stay below that.
    """
    target_column = "retweet_count" if training else "id"
    # Column-wise conversion replaces the former per-row iloc loop; copy=True
    # guarantees the arrays never alias the DataFrame's own buffers.
    X = database[list(FEATURE_COLUMNS)].to_numpy(dtype=float, copy=True)
    y = database[target_column].to_numpy(dtype=float, copy=True)
    return X,y

# Feature matrix X (one row per tweet, 21 columns) and retweet-count targets y for the training frame.
X,y = from_pd_to_numpy(data)

### Classification

We do not want to predict the exact number of retweets yet; we first classify the tweets. To that end, we transform *y* into 6 classes: 0 retweets, <10, <100, <1000, <10000, and everything above.

In [None]:
def to_classify(y) :
    """Map retweet counts to order-of-magnitude classes.

    Class 0 is for counts that are not strictly positive, class 1 for
    0 < count < 10, class 2 for counts below 100, class 3 below 1000,
    class 4 below 10000 and class 5 for everything larger.

    Parameters
    ----------
    y : numpy.ndarray, 1-D
        Retweet counts.

    Returns
    -------
    numpy.ndarray of float64, same shape as `y`
        The class index of every entry.
    """
    upper_bounds = [10, 100, 1000, 10000]
    z = np.zeros(y.shape)
    positive = y > 0
    # digitize returns how many bounds are <= the value (0..4); shifting by
    # one leaves index 0 for the non-positive counts.
    z[positive] = np.digitize(y[positive], upper_bounds) + 1
    return z

# Class index (0 to 5) of every training tweet, derived from its retweet count.
classes = to_classify(y)

## Training dataset

X contains too many tweets whose retweet count is around 0. We want to have as many tweets with class 0 as with class 5 in our training data.

In [None]:
def to_train(X,y,classes,keep_ratio=0.7,exponent=1.3):
    """Select a class-capped subset of the data for training.

    Rows are scanned in order. A row of class c is kept while fewer than
    ``count * (max_class + 1 - c) ** exponent`` rows of that class have been
    kept, where ``count = int(keep_ratio * number of rows in the highest class)``.
    Lower classes therefore get a larger quota than higher ones.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_rows, n_features)
    y : numpy.ndarray, shape (n_rows,)
    classes : numpy.ndarray, shape (n_rows,)
        Class label of every row; labels are truncated to int for counting.
    keep_ratio : float, default 0.7
        Fraction of the highest class used as the base quota.
    exponent : float, default 1.3
        Growth of the quota as the class index decreases.

    Returns
    -------
    X_train, y_train, classes_train : numpy.ndarray of float64
        The kept rows, in their original order.

    Notes
    -----
    Prints the base quota `count`.
    """
    max_class = np.amax(classes)
    count = int(np.count_nonzero(classes == max_class) * keep_ratio) # do not take the whole data
    print(count)

    # Per-class counters are created on demand, so any label value is accepted.
    kept_per_class = {}
    kept_rows = []
    for k in range(X.shape[0]) :
        label = int(classes[k])
        already_kept = kept_per_class.get(label, 0)
        if already_kept < count*(max_class + 1 - label)**exponent :
            kept_rows.append(k)
            kept_per_class[label] = already_kept + 1

    kept_rows = np.asarray(kept_rows, dtype=int)
    # Fancy indexing copies the rows; the float cast keeps the float64 outputs
    # the zero-initialised buffers used to provide.
    return (
        np.array(X[kept_rows,:], dtype=float),
        np.array(y[kept_rows], dtype=float),
        np.array(classes[kept_rows], dtype=float),
    )

# Class-capped subset of (X, y, classes) used to fit the classifier; row order is preserved.
X_train, y_train, classes_train = to_train(X,y,classes)

### Same number of tweet per class

To make sure we have exactly the same number of tweets per class, we use the SMOTE oversampling technique.

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the oversampler so the synthetic samples, and therefore the fitted
# classifier, are the same on every run.
# Only X_train and classes_train are resampled: after this cell y_train no
# longer lines up with X_train row for row.
oversample = SMOTE(random_state=1)
X_train,classes_train = oversample.fit_resample(X_train,classes_train)

## Classification via random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn import svm

# Alternative classifiers, kept for reference:
#   tree.DecisionTreeClassifier(max_depth=2)
#   KNeighborsClassifier(n_neighbors=3)
#   svm.SVC()
clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=30,
    random_state=1,
)
clf.fit(X_train, classes_train)

In [None]:
# Predicted class for every row of X. X contains the rows that to_train
# selected for fitting, so errors computed from `output` are not a held-out estimate.
output = clf.predict(X)

In [None]:
def calculate_error(output,target) :
    """Return the fraction of positions where prediction and target differ.

    Both values are truncated to int before being compared.

    Parameters
    ----------
    output : numpy.ndarray
        Predicted classes.
    target : numpy.ndarray
        Expected classes, at least as long as `output`.

    Returns
    -------
    float
        Number of mismatches divided by ``output.shape[0]``.
    """
    n_samples = output.shape[0]
    mismatches = sum(
        1 for idx in range(n_samples) if int(output[idx]) != int(target[idx])
    )
    return mismatches / n_samples

def calculate_class_error(output,target,n_classes=6):
    """Print, for every true class, the percentage of misclassified samples.

    One line per class is printed as ``<class>  :  <error percentage>``.
    A class without any sample in `target` is reported as ``nan`` instead of
    raising ZeroDivisionError.

    Parameters
    ----------
    output : numpy.ndarray
        Predicted classes (truncated to int).
    target : numpy.ndarray
        True classes (truncated to int); values must be below `n_classes`.
    n_classes : int, default 6
        Number of classes to report.

    Returns
    -------
    None
    """
    error = n_classes*[0]
    pop = n_classes*[0]
    for k in range(output.shape[0]) :
        true_class = int(target[k])
        pop[true_class] += 1
        if int(output[k]) != true_class :
            error[true_class] += 1
    for i in range(n_classes):
        # An unpopulated class has no defined error rate.
        rate = error[i]/pop[i] if pop[i] else float("nan")
        print(i," : ",rate*100)
    
    
def noRT(out) :
    """Return a fixed retweet count for class `out`.

    Class 0 maps to 0; any other class c maps to ``3 * 10 ** (c - 1)``
    (3, 30, 300, ...).
    """
    return 0 if out == 0 else 3 * 10 ** (out - 1)

The error per class

In [None]:
# Print, for each true class, the percentage of tweets the classifier put in another class.
calculate_class_error(output,classes)

## Prediction per class

For each class, we will build a predictor.

In [None]:
# Number of classes, taken from the highest true class index.
number_classes = int(classes.max() + 1)
# Class the first-stage classifier assigns to every training tweet.
classes_entr = clf.predict(X)
# classes_entr = model_cl.predict(X)

### We regroup each class in a dictionary according to the classification by the first model.

In [None]:
def create_dataset_per_class(X,y,classes) :
    """Split the rows of (X, y) into one dataset per class.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_rows, n_features)
    y : numpy.ndarray, shape (n_rows,)
    classes : numpy.ndarray, shape (n_rows,)
        Class of every row (truncated to int).

    Returns
    -------
    dict
        Maps ``str(k)``, for every k from 0 to the highest class, to
        ``{"X": float array (n_k, n_features), "y": float array (n_k,)}``
        holding the rows of class k in their original order. A class without
        rows gets empty arrays instead of raising KeyError.
    """
    labels = np.asarray(classes).astype(int)
    n_classes = int(np.amax(classes)+1)

    dataset = {}
    for label in range(n_classes) :
        # A boolean mask keeps the original row order within the class.
        members = labels == label
        dataset[str(label)] = {
            "X" : np.array(X[members], dtype=float),
            "y" : np.array(y[members], dtype=float),
        }
    return dataset


# Group the training rows by the class the classifier predicted for them (classes_entr), not by their true class.
dataset_per_class = create_dataset_per_class(X,y,classes_entr)

In [None]:
def get_dataset_class(dataset,class_to_get) :
    """Return the ``(X, y)`` pair stored under ``str(class_to_get)`` in `dataset`."""
    entry = dataset[str(class_to_get)]
    return entry["X"], entry["y"]

## All the models will be alike

*d* contains the training set and the model associated with each class.
We tried different learning models, such as a neural network, random forest and gradient tree boosting. The first gave us the best results.

In [None]:
#Dependencies
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

def train_models(dataset_per_class, epochs=45, batch_size=64, test_size=0.5, random_state=None) :
    """Fit one neural-network regressor per non-zero class.

    For every key of `dataset_per_class` other than class 0, the class data is
    split into a fitting part and a validation part, a dense network is
    trained with mean absolute error, the loss curves are plotted, and the
    model is stored under the "model" key of that class.

    Parameters
    ----------
    dataset_per_class : dict
        Maps class keys ("0", "1", ...) to dicts holding "X" and "y".
        The dictionary is modified in place.
    epochs : int, default 45
    batch_size : int, default 64
    test_size : float, default 0.5
        Share of each class kept for validation.
    random_state : int or None, default None
        Forwarded to train_test_split; pass an int for a repeatable split.

    Returns
    -------
    dict
        The same `dataset_per_class` object, with a "model" entry added to
        every trained class.
    """
    for k in dataset_per_class :
        if int(k) == 0 :
            # we do not train the first class as it is only a 0 prediction
            continue

        X_class, y_class = get_dataset_class(dataset_per_class,int(k))
        print(X_class.shape,y_class.shape)

        X_fit, X_val, y_fit, y_val = train_test_split(
            X_class, y_class, test_size=test_size, random_state=random_state
        )

        # Neural network; the input width follows the data rather than a
        # hard-coded feature count.
        model = Sequential()
        model.add(Dense(64, input_dim=X_class.shape[1], activation="sigmoid"))
        model.add(Dense(128, activation="relu"))
        model.add(Dense(256, activation="relu"))
        model.add(Dense(32, activation="relu"))
        model.add(Dense(1))
        model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
        history = model.fit(
            X_fit, y_fit,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
        )

        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

        # Alternatives kept for reference (fit on X_fit, y_fit):
        #   RandomForestRegressor(n_estimators=200, max_depth=30, random_state=0)
        #   GradientBoostingRegressor(n_estimators=400, max_depth=30, random_state=0)

        dataset_per_class[str(int(k))]["model"] = model
    return dataset_per_class

# `d` is the same dictionary object as dataset_per_class; each trained class now also holds a "model" entry.
d = train_models(dataset_per_class)

## Now we need to compute these two steps on our evaluation data

### Transform eval_data

In [None]:
# With training=False the second return value holds the tweet ids instead of retweet counts.
X_eval,id_eval = from_pd_to_numpy(data_eval,training=False)

### Classify the data

In [None]:
# Assign every evaluation tweet to a class with the first-stage classifier.
classes_eval = clf.predict(X_eval)

### Group these data according to their class prediction

In [None]:
def create_dataset_eval_classes(X_eval,id_eval,eval_classes) :
    """Group the evaluation rows by predicted class.

    Parameters
    ----------
    X_eval : numpy.ndarray, shape (n_rows, n_features)
    id_eval : numpy.ndarray, shape (n_rows,)
        Tweet id of every row.
    eval_classes : numpy.ndarray
        Either one class label per row (like 0, 1 or 2), or one score vector
        per row (like [0.2, 0.3, 0.4, 0.1]); in the latter case the class is
        the position of the largest score.

    Returns
    -------
    dict
        Maps ``str(k)`` for every class k to a dict of float arrays:
        "X" (n_k, n_features), "id" (n_k,) and "class" (n_k,, filled with k).
        Rows keep their original order. A class without rows gets empty
        arrays instead of raising KeyError.
    """
    predictions = np.asarray(eval_classes)
    if predictions.ndim > 1 and predictions.shape[1] > 1 :
        # Score vectors: the class count is the vector width, not the
        # largest score.
        labels = np.argmax(predictions, axis=1)
        n_classes = predictions.shape[1]
    else :
        labels = predictions.reshape(-1).astype(int)
        n_classes = int(np.amax(predictions)+1)

    features = np.asarray(X_eval)
    ids = np.asarray(id_eval)

    dataset = {}
    for label in range(n_classes) :
        members = labels == label
        dataset[str(label)] = {
            "X" : np.array(features[members], dtype=float),
            "id" : np.array(ids[members], dtype=float),
            "class" : np.full(int(members.sum()), float(label)),
        }
    return dataset

# Evaluation features and tweet ids, grouped by the class predicted in classes_eval.
dataset_eval = create_dataset_eval_classes(X_eval,id_eval,classes_eval) 

### Use the previous model on each class

The prediction will be stored in *res*, which will then be ordered.

In [None]:
def evaluate_models(dataset_models,dataset_eval) :
    """Predict a retweet count for every evaluation tweet.

    Tweets of class "0" get the value 0. For every other class, the model
    stored under ``dataset_models[k]["model"]`` predicts from
    ``dataset_eval[k]["X"]`` and each prediction is truncated to int.

    Parameters
    ----------
    dataset_models : dict
        Maps class keys to dicts holding a "model" with a ``predict`` method.
    dataset_eval : dict
        Maps class keys to dicts holding "X" and "id" arrays.

    Returns
    -------
    dict
        Maps ``int(tweet id)`` to the predicted retweet count (int).

    Notes
    -----
    Classes without evaluation rows are skipped, so they need no model.
    """
    res = {}
    for tweet_id in dataset_eval.get('0', {}).get("id", []) :
        res[int(tweet_id)] = 0
    for k in dataset_eval :
        if int(k) == 0 :
            continue
        ids = dataset_eval[k]["id"]
        if len(ids) == 0 :
            # Nothing to predict for this class.
            continue
        raw = dataset_models[k]["model"].predict(dataset_eval[k]["X"])
        # Models may return shape (n, 1) or (n,); flatten so each entry is a
        # scalar before the int conversion.
        prediction = np.asarray(raw).reshape(-1)
        for tweet_id, value in zip(ids, prediction) :
            res[int(tweet_id)] = int(value)
    return res
    
# Map every evaluation tweet id to its predicted retweet count (0 for class 0).
res = evaluate_models(d,dataset_eval)

### We need to order the results by the tweet ID.

In [None]:
import collections

# Sort the (tweet id, prediction) pairs by tweet id, which is the dictionary key.
od = collections.OrderedDict(sorted(res.items()))

### Save the result

In [None]:
# Two-column result table built from the id-sorted predictions.
df = pd.DataFrame(
    {
        "TweetID": list(od.keys()),
        "NoRetweets": list(od.values()),
    }
)

In [None]:
# Write the result table; index=False keeps the DataFrame index out of the CSV.
df.to_csv("results/res21.csv",index=False)