# Notebook magique with AutoML

TO DO:
- fix hashtags
- fix log
- remove outliers

In [34]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sns as sns
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack


# Load the training data
train_data = pd.read_csv("data/train.csv")

# Load the evaluation data
eval_data = pd.read_csv("data/evaluation.csv")





Split data

In [35]:
#split data
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.8, test_size=0.2)
# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(['retweets_count'], axis=1)
X_test = X_test.drop(['retweets_count'], axis=1)

## Outliers

In [36]:
# remove outliers with quartile
def remove_outliers(df, col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    df = df[(df[col] >= q1 - 5 * iqr) & (df[col] <= q3 + 5 * iqr)]
    return df


In [37]:
def feature_engineering(in_df):
    rs_df = in_df

    #mentions is always empty -> DROP Mention
    rs_df = rs_df.drop(['mentions'], axis=1)

    #add a column to data which counts url
    rs_df['url_count'] = rs_df['urls'].str.count('http')
    rs_df= rs_df.drop(['urls'], axis=1)
    #add a column to data which gives length of hashtags
    #TO DO
    #add columns to data which makes ratios
    rs_df['followers_friends'] = rs_df['followers_count'] / (rs_df['friends_count'].apply(lambda x: x+1))
    rs_df['favorites_followers'] = rs_df['favorites_count'] / (rs_df['followers_count'].apply(lambda x: x+1))
    rs_df['favorites_friends'] = rs_df['favorites_count'] / (rs_df['friends_count'].apply(lambda x: x+1))

    return rs_df



In [38]:
from datetime import datetime
def time_engineering(in_df):
    rs_df = in_df
    rs_df["hour"] = rs_df.timestamp.apply(lambda t: (datetime.fromtimestamp(t//1000)).hour)
    rs_df["day"] = rs_df.timestamp.apply(lambda t: (datetime.fromtimestamp(t//1000)).weekday())
    rs_df.drop(['timestamp'], axis=1)
    return rs_df

In [39]:
from textblob import TextBlob #pip install textblob-fr
from textblob_fr import PatternTagger, PatternAnalyzer
def text_engineering(in_df):
    rs_df = in_df
    #add columns related to sentiment analysis
    rs_df['polarity']=rs_df['text'].apply(lambda x: TextBlob(x, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()).sentiment[0])
    rs_df['subjectivity']=rs_df['text'].apply(lambda x: TextBlob(x, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()).sentiment[1])
    #drop the text column
    rs_df = rs_df.drop(['text'], axis=1)
    return rs_df

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
def hashtags_engineering(in_df):
    rs_df = in_df
    #make a string of the list
    rs_df['hashtags']=rs_df['hashtags'].apply(lambda x: x.replace('[','').replace(']','').replace("'",''))
    vectorizer2 = CountVectorizer(max_features=14, stop_words=stopwords.words('french')) 
    hashtags = vectorizer2.fit_transform(rs_df['hashtags'])
    # show vocabulary
    pop=np.sum(hashtags.toarray(),axis=1)
    #pop to df
    pop=pd.DataFrame(pop,columns=['pop'])# popularity=number of popular hashtags
    #add pop to df
    rs_df=pd.concat([rs_df,pop],axis=1)
    #drop hashtags
    rs_df = rs_df.drop(['hashtags'], axis=1)
    return rs_df


## Scaler

In [41]:
def logtransformdrop(dataframe, cols):
    for col in cols:
        dataframe['log_' + col ] = dataframe[col].apply(lambda x: np.log10(int(x)+1))
    dataframe = dataframe.drop(cols,axis=1)
    return dataframe

## Data processing

In [42]:
new_X_train=feature_engineering(X_train)
new_X_train=time_engineering(new_X_train)
new_X_train=text_engineering(new_X_train)
#new_X_train=hashtags_engineering(new_X_train)
#new_X_train=logtransformdrop(X_train,['followers.count','friends_count','favorites_count','followers_count'])

## Flaml

In [43]:
from flaml import AutoML
automl = AutoML()

settings = {
    "time_budget": 1800,  # total running time in seconds
    "metric": 'mae',  # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
                           # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
    "task": 'regression',  # task type
    'estimator_list':['rf','extra_tree'],   #['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
    "seed": 7654321,    # random seed
}

automl.fit(new_X_train.drop(['TweetID'], axis=1), y_train, **settings)
# Print the best model
print(automl.model.estimator)

[flaml.automl: 11-21 19:05:56] {2600} INFO - task = regression
[flaml.automl: 11-21 19:05:56] {2602} INFO - Data split method: uniform
[flaml.automl: 11-21 19:05:56] {2605} INFO - Evaluation method: holdout
[flaml.automl: 11-21 19:05:56] {2727} INFO - Minimizing error metric: mae
[flaml.automl: 11-21 19:05:56] {2869} INFO - List of ML learners in AutoML Run: ['rf', 'extra_tree']
[flaml.automl: 11-21 19:05:56] {3164} INFO - iteration 0, current learner rf
[flaml.automl: 11-21 19:05:56] {3297} INFO - Estimated sufficient time budget=20450s. Estimated necessary time budget=20s.
[flaml.automl: 11-21 19:05:56] {3344} INFO -  at 1.2s,	estimator rf's best error=11.1546,	best estimator rf's best error=11.1546
[flaml.automl: 11-21 19:05:56] {3164} INFO - iteration 1, current learner extra_tree
[flaml.automl: 11-21 19:05:56] {3344} INFO -  at 1.3s,	estimator extra_tree's best error=13.9062,	best estimator rf's best error=11.1546
[flaml.automl: 11-21 19:05:56] {3164} INFO - iteration 2, current l


AutoML.fit(self, X_train, y_train, dataframe, label, metric, task, n_jobs, log_file_name, estimator_list, time_budget, max_iter, sample, ensemble, eval_method, log_type, model_history, split_ratio, n_splits, log_training_metric, mem_thres, pred_time_limit, train_time_limit, X_val, y_val, sample_weight_val, groups_val, groups, verbose, retrain_full, split_type, learner_selector, hpo_method, starting_points, seed, n_concurrent_trials, keep_search_state, preserve_checkpoint, early_stop, append_log, auto_augment, min_sample_size, use_ray, metric_constraints, custom_hp, cv_score_agg_func, skip_transform, fit_kwargs_by_estimator, **fit_kwargs)


xgboost, rf, lgbm, extra_tree

# Submission

In [None]:
#test on X_test
y_pred = automl.predict(X_test.drop(['TweetID'], axis=1))
#mean absolute error
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

## Model

In [None]:
# Predict the CoverType for the evaluation dataset
#import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
#import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor

model=automl.model

model.fit(X_train, y_train)

#show top features
importances = model.feature_importances_
print(importances)



[0.75952837 0.02096927 0.0268793  0.01763596 0.15505379 0.01993332]


In [None]:

pred = model.predict(eval_data.drop(['TweetID'], axis=1))
#round to integer
pred = np.round(pred)
# Dump the results into a csv file that follows the required Kaggle template
eval_data['retweets_count'] = pred
eval_data[["TweetID", "retweets_count"]].to_csv("data/submission.csv", index=False)