# INF554 Kaggle challenge notebook
Team: Kagglers

Members: Marian Huot, Antoine Gleisberg, Aude Bouillé

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple

import seaborn as sns

from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from verstack.stratified_continuous_split import scsplit

In [24]:
from utils import clean_data

In [25]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [41]:
import csv

In [42]:
# Load the training data
train_data = pd.read_csv("data/train.csv")

# Load the evaluation data
eval_data = pd.read_csv("data/evaluation.csv")

In [59]:
# Average word count per tweet
print(np.mean(np.array([train_data["text"][i].count(' ') for i in range(len(train_data["text"]))])))

8.252581440747637


In [28]:
# remove outliers with retweets_count above n
def remove_outliers_above(df, col, n):
    df = df[df[col] < n]
    return df

def percentage_outliers_above(df, col, n):
    z = df[col].copy()
    z[z < n] = 0
    print(f"n = {n} ; Percentage of outliers : {round(len(z[z > 0]) / len(z) * 100, 4)}%")

In [29]:
percentage_outliers_above(train_data, 'retweets_count', 7000)

n = 7000 ; Percentage of outliers : 0.0155%


In [30]:
columns = ['TweetID', 'mention', 'urls', 'timestamp', 'text', 'total_text', 'hashtags', 'followers_count', 'friends_count', 'favorites_count', 'statuses_count', 'verified', 'url_count', 'followers_friends',
       'hour', 'day','week_in_month', 'polarity', 'subjectivity', 'hashtags_count', 'topic_1','topic_2', 'topic_3','topic_4', 'topic_5','cluster']

useless_cols = ["TweetID", "mention", "urls", "timestamp", "text", "hashtags", "total_text"]

In [37]:
pipelines: List[Tuple[str, bool, Tuple[float, float], Pipeline]] = []
#pipelines.append(('ScaledKNN',      False, (0, 0), Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor(5))])))
#pipelines.append(('ScaledRF',       False, (0, 0), Pipeline([('Scaler', StandardScaler()),('RF', RandomForestRegressor(n_estimators=10))])))
pipelines.append(('UnscaledRF',     False, (0, 0), Pipeline([('RF', RandomForestRegressor(n_estimators=10))])))


# pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
# pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
# pipelines.append(('OptimizedRF', Pipeline([('RF', RandomForestRegressor(max_features=0.93, max_leaf_nodes=2310, n_estimators=90, n_jobs=-1))])))
# pipelines.append(('OptimizedXGB', Pipeline([('XGB', XGBRegressor(base_score=0.5, booster='gbtree'))])))


In [32]:
def write_predictions_to_file(file_name, eval_data, predictions):
    with open(file_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "retweets_count"])
        for index, prediction in enumerate(predictions):
            writer.writerow([str(eval_data['TweetID'].iloc[index]) , str(round(prediction))])

In [38]:
for n in range(5000, 10001, 250):

    print(n)

    # split data using stratified continuous split
    X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.8, test_size=0.2)

    X_train['retweets_count'] = y_train
    # remove outliers according to visualization
    X_train = remove_outliers_above(X_train, 'retweets_count', n)
    # split again
    y_train = X_train['retweets_count']
    #X_train = X_train.drop('retweets_count',axis=1)

    # We remove the actual number of retweets from our features since it is the value that we are trying to predict
    X_train = X_train.drop(['retweets_count'], axis=1)
    X_test = X_test.drop(['retweets_count'], axis=1)
    
    ids = X_test["TweetID"]
    
    new_train_data = clean_data(X_train, columns, useless_cols)
    new_test_data = clean_data(X_test, columns, useless_cols)
    
    pipelines: List[Tuple[str, bool, Tuple[float, float], Pipeline]] = []
    #pipelines.append(('ScaledKNN',      False, (0, 0), Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor(5))])))
    #pipelines.append(('ScaledRF',       False, (0, 0), Pipeline([('Scaler', StandardScaler()),('RF', RandomForestRegressor(n_estimators=10))])))
    pipelines.append(('UnscaledRF',     False, (0, 0), Pipeline([('RF', RandomForestRegressor(n_estimators=10))])))

    for i, (name, was_evaluated, (mean, std), model) in enumerate(pipelines):
        if not was_evaluated:
            kfold = KFold(n_splits=10)
            cv_results = cross_val_score(model, new_train_data, y_train, cv=kfold, scoring='neg_mean_absolute_error')
            pipelines[i] = (name, True, (cv_results.mean(), cv_results.std()), model)
        print(f"{name}: {pipelines[i][2][0]} ({pipelines[i][2][1]})")

    #La cross validation nous donne deja les perfs de chaque modèle
    for name, was_evaluated, (mean, std), model in pipelines:
        model.fit(new_train_data, y_train)
        predictions = model.predict(new_test_data)
        predictions = [round(value) if value >= 0 else 0 for value in predictions]
        print(f"{name} done.\nMean absolute error: {mean_absolute_error(y_test, predictions)}")
        y_test["TweetID"] = ids
        write_predictions_to_file("result" + str(n) + name + ".csv", y_test, predictions)

5000
UnscaledRF: -5.397619111800038 (0.289856156052332)
UnscaledRF done.
Mean absolute error: 7.250388450998672
5250
UnscaledRF: -5.448423444323973 (0.20496298208550745)
UnscaledRF done.
Mean absolute error: 7.205667146933356
5500
UnscaledRF: -5.452374286232218 (0.19912195600958824)
UnscaledRF done.
Mean absolute error: 7.65833262705879
5750
UnscaledRF: -5.490453121872535 (0.29667372970190997)
UnscaledRF done.
Mean absolute error: 6.855552730457384
6000
UnscaledRF: -5.513578370659965 (0.3098757356424935)
UnscaledRF done.
Mean absolute error: 7.238551289657315
6250
UnscaledRF: -5.6651571927386986 (0.3486339339558479)
UnscaledRF done.
Mean absolute error: 6.779091448427833
6500
UnscaledRF: -5.636866110371128 (0.2969508885341222)
UnscaledRF done.
Mean absolute error: 6.636960759386389
6750
UnscaledRF: -5.7675200746096 (0.2033255064152532)
UnscaledRF done.
Mean absolute error: 7.389143147724384
7000
UnscaledRF: -5.7184167262609344 (0.17708091470790482)
UnscaledRF done.
Mean absolute error:

In [None]:
'''5000
UnscaledRF: -5.397619111800038 (0.289856156052332)
UnscaledRF done.
Mean absolute error: 7.250388450998672
5250
UnscaledRF: -5.448423444323973 (0.20496298208550745)
UnscaledRF done.
Mean absolute error: 7.205667146933356
5500
UnscaledRF: -5.452374286232218 (0.19912195600958824)
UnscaledRF done.
Mean absolute error: 7.65833262705879
5750
UnscaledRF: -5.490453121872535 (0.29667372970190997)
UnscaledRF done.
Mean absolute error: 6.855552730457384
6000
UnscaledRF: -5.513578370659965 (0.3098757356424935)
UnscaledRF done.
Mean absolute error: 7.238551289657315
6250
UnscaledRF: -5.6651571927386986 (0.3486339339558479)
UnscaledRF done.
Mean absolute error: 6.779091448427833
6500
UnscaledRF: -5.636866110371128 (0.2969508885341222)
UnscaledRF done.
Mean absolute error: 6.636960759386389
6750
UnscaledRF: -5.7675200746096 (0.2033255064152532)
UnscaledRF done.
Mean absolute error: 7.389143147724384
7000
UnscaledRF: -5.7184167262609344 (0.17708091470790482)
UnscaledRF done.
Mean absolute error: 7.031556346583044
7250
UnscaledRF: -5.803783702124602 (0.42056845262876075)
UnscaledRF done.
Mean absolute error: 6.732138316806509
7500
UnscaledRF: -5.827816174086125 (0.4009256530399997)
UnscaledRF done.
Mean absolute error: 6.900556544339916
7750
UnscaledRF: -5.777666667273329 (0.3242835595099051)
UnscaledRF done.
Mean absolute error: 7.1360849789530185
8000
UnscaledRF: -5.9961866794291225 (0.2870194877498937)
UnscaledRF done.
Mean absolute error: 6.84656891826991
8250
UnscaledRF: -6.0036469400297925 (0.359911572597961)
UnscaledRF done.
Mean absolute error: 6.89468034014182
8500
UnscaledRF: -5.993208859687341 (0.2666622237075625)
UnscaledRF done.
Mean absolute error: 6.950560781987174
8750
UnscaledRF: -6.045329019546459 (0.37466794818602595)
UnscaledRF done.
Mean absolute error: 6.49824843913326
9000
UnscaledRF: -6.0359102598466565 (0.41671869903898096)
UnscaledRF done.
Mean absolute error: 6.8137695284911155
9250
UnscaledRF: -5.968808040382038 (0.2719186893768838)
UnscaledRF done.
Mean absolute error: 6.54496143740995
9500
UnscaledRF: -6.04767550590025 (0.37415321890343517)
UnscaledRF done.
Mean absolute error: 6.819391473853717
9750
UnscaledRF: -6.1820427335334625 (0.3859783876656396)
UnscaledRF done.
Mean absolute error: 6.661878125264853
10000
UnscaledRF: -6.01128750100476 (0.48581275488766285)
UnscaledRF done.
Mean absolute error: 6.807610814475803'''

In [39]:
for n in range(5000, 10001, 250):

    print(n)

    # split data using stratified continuous split
    X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.8, test_size=0.2)

    X_train['retweets_count'] = y_train
    # remove outliers according to visualization
    X_train = remove_outliers_above(X_train, 'retweets_count', n)
    # split again
    y_train = X_train['retweets_count']
    #X_train = X_train.drop('retweets_count',axis=1)

    # We remove the actual number of retweets from our features since it is the value that we are trying to predict
    X_train = X_train.drop(['retweets_count'], axis=1)
    X_test = X_test.drop(['retweets_count'], axis=1)
    
    ids = X_test["TweetID"]
    
    new_train_data = clean_data(X_train, columns, useless_cols)
    new_test_data = clean_data(X_test, columns, useless_cols)
    
    pipelines: List[Tuple[str, bool, Tuple[float, float], Pipeline]] = []
    pipelines.append(('ScaledKNN',      False, (0, 0), Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor(5))])))
    #pipelines.append(('ScaledRF',       False, (0, 0), Pipeline([('Scaler', StandardScaler()),('RF', RandomForestRegressor(n_estimators=10))])))
    #pipelines.append(('UnscaledRF',     False, (0, 0), Pipeline([('RF', RandomForestRegressor(n_estimators=10))])))

    for i, (name, was_evaluated, (mean, std), model) in enumerate(pipelines):
        if not was_evaluated:
            kfold = KFold(n_splits=10)
            cv_results = cross_val_score(model, new_train_data, y_train, cv=kfold, scoring='neg_mean_absolute_error')
            pipelines[i] = (name, True, (cv_results.mean(), cv_results.std()), model)
        print(f"{name}: {pipelines[i][2][0]} ({pipelines[i][2][1]})")

    #La cross validation nous donne deja les perfs de chaque modèle
    for name, was_evaluated, (mean, std), model in pipelines:
        model.fit(new_train_data, y_train)
        predictions = model.predict(new_test_data)
        predictions = [round(value) if value >= 0 else 0 for value in predictions]
        print(f"{name} done.\nMean absolute error: {mean_absolute_error(y_test, predictions)}")
        y_test["TweetID"] = ids
        write_predictions_to_file("result" + str(n) + name + ".csv", y_test, predictions)

5000
ScaledKNN: -7.894287009114071 (0.3450909842333503)
ScaledKNN done.
Mean absolute error: 9.339463796366923
5250
ScaledKNN: -7.9791470557487205 (0.37644014608130455)
ScaledKNN done.
Mean absolute error: 9.563818967709128
5500
ScaledKNN: -8.093326449306732 (0.4496690576257772)
ScaledKNN done.
Mean absolute error: 9.815280956013222
5750
ScaledKNN: -8.168020601944681 (0.3120656475183672)
ScaledKNN done.
Mean absolute error: 9.49057829759584
6000
ScaledKNN: -8.276324046103232 (0.2635531132901475)
ScaledKNN done.
Mean absolute error: 9.443879424810012
6250
ScaledKNN: -8.416929714360524 (0.21574769657913068)
ScaledKNN done.
Mean absolute error: 10.159420289855072
6500
ScaledKNN: -8.341485117416562 (0.32338256711440094)
ScaledKNN done.
Mean absolute error: 9.458386303924062
6750
ScaledKNN: -8.609397567799274 (0.3272407252791545)
ScaledKNN done.
Mean absolute error: 9.865878464276634
7000
ScaledKNN: -8.557966683274076 (0.26358452519277437)
ScaledKNN done.
Mean absolute error: 9.459855354973

In [None]:
'''
5000
ScaledKNN: -7.894287009114071 (0.3450909842333503)
ScaledKNN done.
Mean absolute error: 9.339463796366923
5250
ScaledKNN: -7.9791470557487205 (0.37644014608130455)
ScaledKNN done.
Mean absolute error: 9.563818967709128
5500
ScaledKNN: -8.093326449306732 (0.4496690576257772)
ScaledKNN done.
Mean absolute error: 9.815280956013222
5750
ScaledKNN: -8.168020601944681 (0.3120656475183672)
ScaledKNN done.
Mean absolute error: 9.49057829759584
6000
ScaledKNN: -8.276324046103232 (0.2635531132901475)
ScaledKNN done.
Mean absolute error: 9.443879424810012
6250
ScaledKNN: -8.416929714360524 (0.21574769657913068)
ScaledKNN done.
Mean absolute error: 10.159420289855072
6500
ScaledKNN: -8.341485117416562 (0.32338256711440094)
ScaledKNN done.
Mean absolute error: 9.458386303924062
6750
ScaledKNN: -8.609397567799274 (0.3272407252791545)
ScaledKNN done.
Mean absolute error: 9.865878464276634
7000
ScaledKNN: -8.557966683274076 (0.26358452519277437)
ScaledKNN done.
Mean absolute error: 9.459855354973586
7250
ScaledKNN: -8.706649846273757 (0.3092808655535725)
ScaledKNN done.
Mean absolute error: 9.723620645817443
7500
ScaledKNN: -8.821736749583033 (0.3255753348757769)
ScaledKNN done.
Mean absolute error: 9.57663078791988
7750
ScaledKNN: -8.84258876252095 (0.38725615045695283)
ScaledKNN done.
Mean absolute error: 9.54897307681442
8000
ScaledKNN: -8.939930837164232 (0.38227086718514885)
ScaledKNN done.
Mean absolute error: 9.743269203604825
8250
ScaledKNN: -8.972601649639518 (0.3455014977246472)
ScaledKNN done.
Mean absolute error: 9.739977964234257
8500
ScaledKNN: -8.89523998035651 (0.322745322638089)
ScaledKNN done.
Mean absolute error: 10.12790631974461
8750
ScaledKNN: -9.04791708050973 (0.46818573663030916)
ScaledKNN done.
Mean absolute error: 9.359281860044636
9000
ScaledKNN: -9.069125251195596 (0.32251208583782914)
ScaledKNN done.
Mean absolute error: 9.814814814814815
9250
ScaledKNN: -9.1081402632278 (0.4154507979759329)
ScaledKNN done.
Mean absolute error: 9.573212419131565
9500
ScaledKNN: -9.06810171990799 (0.48794427008242136)
ScaledKNN done.
Mean absolute error: 10.597875526174535
9750
ScaledKNN: -9.196238742715874 (0.47392663619712594)
ScaledKNN done.
Mean absolute error: 9.770489024493601
10000
ScaledKNN: -9.130936877339785 (0.5337880460690733)
ScaledKNN done.
Mean absolute error: 10.022346526541797'''