# Magic notebook with AutoML

In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sns as sns
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack


# Read the training set and the evaluation set from the data/ directory,
# in that order.
train_data, eval_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/evaluation.csv")
)

# Remove outliers using an inter-quartile-range (IQR) fence.
def remove_outliers(df, col, factor=5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fence.

    The fence is ``[q1 - factor * iqr, q3 + factor * iqr]`` (both bounds
    inclusive), where ``q1`` and ``q3`` are the 25% and 75% quantiles of
    ``df[col]`` and ``iqr = q3 - q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column used to detect outliers.
    factor : float, default 5
        Width of the fence in multiples of the IQR. Larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels. Rows whose
        ``col`` value is NaN are dropped, because NaN never compares as
        being inside the fence.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # Series.between is inclusive on both ends by default, matching >= / <=.
    inside_fence = values.between(q1 - factor * iqr, q3 + factor * iqr)
    return df[inside_fence]

#train_data = remove_outliers(train_data, 'followers_count')

#reset index
#train_data = train_data.reset_index(drop=True)

  from pandas import MultiIndex, Int64Index


## Split data

In [2]:
# Separate the regression target ('retweets_count') from the feature columns.
y_train = train_data['retweets_count']
X_train = train_data.drop(columns=['retweets_count'])


In [3]:

# Drop every column that is not used as a model feature:
#   mentions - always empty
#   verified - dropped
#   TweetID  - identifier, not relevant for prediction
#   urls     - dropped as-is (no URL-count feature is derived)
#   hashtags - dropped as-is (no hashtag-count feature is derived)
#   text     - raw tweet text
X_train = X_train.drop(
    columns=['mentions', 'verified', 'TweetID', 'urls', 'hashtags', 'text']
)

# Engineered feature: followers-to-friends ratio. The denominator is
# friends_count + 1, presumably so accounts with zero friends do not
# trigger a division by zero.
X_train['followers_friends'] = X_train['followers_count'] / (X_train['friends_count'] + 1)



## Scaler

In [5]:
# Min-max scaling of the training features.
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features, then replace X_train with its
# scaled version. The fitted `scaler` is reused later on the evaluation data.
# NOTE: this cell overwrites X_train, so running it a second time would fit
# the scaler on data that has already been scaled.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)


In [6]:
from flaml import AutoML

automl = AutoML()

# Search configuration passed to AutoML.fit as keyword arguments.
settings = dict(
    time_budget=3600,        # total running time in seconds
    metric='mae',            # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
                             # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
    task='regression',       # task type
    estimator_list=['rf'],   # list of ML learners (random forest only)
    seed=7654321,            # random seed
)

# Run the hyper-parameter search on the scaled training data.
automl.fit(X_train, y_train, **settings)

# Show the best estimator found by the search.
print(automl.model.estimator)

[flaml.automl: 11-17 10:10:33] {2600} INFO - task = regression
[flaml.automl: 11-17 10:10:33] {2602} INFO - Data split method: uniform
[flaml.automl: 11-17 10:10:33] {2605} INFO - Evaluation method: holdout
[flaml.automl: 11-17 10:10:33] {2727} INFO - Minimizing error metric: mae
[flaml.automl: 11-17 10:10:33] {2869} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl: 11-17 10:10:33] {3164} INFO - iteration 0, current learner rf
[flaml.automl: 11-17 10:10:33] {3297} INFO - Estimated sufficient time budget=16888s. Estimated necessary time budget=17s.
[flaml.automl: 11-17 10:10:33] {3344} INFO -  at 0.2s,	estimator rf's best error=16.0109,	best estimator rf's best error=16.0109
[flaml.automl: 11-17 10:10:33] {3164} INFO - iteration 1, current learner rf
[flaml.automl: 11-17 10:10:33] {3344} INFO -  at 0.2s,	estimator rf's best error=14.1194,	best estimator rf's best error=14.1194
[flaml.automl: 11-17 10:10:33] {3164} INFO - iteration 2, current learner rf
[flaml.automl: 11-17

RandomForestRegressor(max_features=0.7068879628157007, max_leaf_nodes=32767,
                      n_estimators=16, n_jobs=-1)



AutoML.fit(self, X_train, y_train, dataframe, label, metric, task, n_jobs, log_file_name, estimator_list, time_budget, max_iter, sample, ensemble, eval_method, log_type, model_history, split_ratio, n_splits, log_training_metric, mem_thres, pred_time_limit, train_time_limit, X_val, y_val, sample_weight_val, groups_val, groups, verbose, retrain_full, split_type, learner_selector, hpo_method, starting_points, seed, n_concurrent_trials, keep_search_state, preserve_checkpoint, early_stop, append_log, auto_augment, min_sample_size, use_ray, metric_constraints, custom_hp, cv_score_agg_func, skip_transform, fit_kwargs_by_estimator, **fit_kwargs)


# Submission

In [19]:
# Load the evaluation data again (same file as read in the first cell), so the
# preprocessing below starts from a freshly loaded frame.
eval_data = pd.read_csv("data/evaluation.csv")

In [20]:
# Drop the columns that are not used as model features, as was done for the
# training data:
#   mentions - always empty
#   hashtags, verified, urls - dropped as-is
#   text     - raw tweet text
# 'TweetID' is kept at this stage.
eval_data = eval_data.drop(
    columns=['mentions', 'hashtags', 'verified', 'urls', 'text']
)

# Engineered feature: followers-to-friends ratio. The denominator is
# friends_count + 1, presumably so accounts with zero friends do not
# trigger a division by zero.
eval_data['followers_friends'] = eval_data['followers_count'] / (eval_data['friends_count'] + 1)


## Model

In [21]:
# Refit a random forest on the full (scaled) training data, using the
# hyper-parameters of the best estimator printed by the AutoML search above.
from sklearn.ensemble import RandomForestRegressor
# ExtraTreesRegressor is imported but not used in this cell.
from sklearn.ensemble import ExtraTreesRegressor

model = RandomForestRegressor(
    max_features=0.7068879628157007,
    max_leaf_nodes=32767,
    n_estimators=16,
    n_jobs=-1,
    # Fixed seed (same value as the AutoML `seed` setting) so that the fitted
    # forest, its feature importances and the resulting submission are
    # reproducible from one run to the next.
    random_state=7654321,
)

model.fit(X_train, y_train)

# Show the relative importance of each feature. Values follow the column
# order of X_train as it was passed to the scaler.
importances = model.feature_importances_
print(importances)



[0.71667252 0.03845099 0.03591515 0.02330778 0.14485256 0.04080101]


In [23]:

# Build the feature matrix for the evaluation set.
# 'TweetID' is only an identifier and is not a model feature.
# 'retweets_count' exists only if this cell has already been run once (it is
# added below); dropping it when present keeps the cell safe to re-run, because
# otherwise the scaler would receive one column more than it was fitted on.
eval_features = (
    eval_data
    .drop(columns=['TweetID'])
    .drop(columns=['retweets_count'], errors='ignore')
)
pred = model.predict(scaler.transform(eval_features))
# Round to the nearest whole number (np.round keeps the float dtype).
pred = np.round(pred)
# Dump the results into a csv file that follows the required Kaggle template
eval_data['retweets_count'] = pred
eval_data[["TweetID", "retweets_count"]].to_csv("data/submission.csv", index=False)