# The notebook used to generate the submission on Kaggle

## Load data


In [31]:
import csv
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

# Read both CSV files in one statement: the training split first, then the
# evaluation split.
train_data, eval_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/evaluation.csv")
)

## Data cleaning


In [32]:
def _add_engineered_features(tweets):
    """Return a copy of ``tweets`` with the two derived feature columns appended.

    * ``url_count`` -- how many times ``http`` occurs in each ``urls`` entry.
    * ``followers_friends`` -- ``followers_count`` divided by
      ``friends_count + 1``; the ``+ 1`` keeps the denominator non-zero when
      ``friends_count`` is 0.

    The input frame is not modified.
    """
    return tweets.assign(
        url_count=tweets['urls'].str.count('http'),
        # Plain vectorised arithmetic; no per-row ``.apply(lambda ...)`` needed.
        followers_friends=tweets['followers_count'] / (tweets['friends_count'] + 1),
    )


# Columns removed from both frames before modelling.
drop_cols = ["mentions", "TweetID", "urls", "hashtags", "text", "verified"]

# Apply exactly the same feature engineering to both splits.
train_data = _add_engineered_features(train_data)
eval_data = _add_engineered_features(eval_data)

# "TweetID" is dropped below, so keep the evaluation ids: they are needed
# again when the submission file is written.
ids = eval_data["TweetID"]

train_data = train_data.drop(columns=drop_cols)
eval_data = eval_data.drop(columns=drop_cols)

## Prediction

In [33]:
# Two-stage model: a Lasso regression is fitted on the retweet counts, then a
# random forest is fitted on the Lasso's training residuals. The final
# prediction is the sum of the two stages.
X, y = train_data.drop(columns=["retweets_count"]), train_data["retweets_count"]

# Select the evaluation features by the training column names. This keeps the
# column order aligned with X and makes the cell safe to re-run: the last line
# of this cell adds "TweetID" back to eval_data, which must not reach predict().
eval_features = eval_data[X.columns]

lasso = Lasso(alpha=0.5)
lasso.fit(X, y)
pred_lasso = lasso.predict(eval_features)
print(lasso.coef_)

# Residuals of the linear stage on the training set: the forest's target.
err_train = y - lasso.predict(X)
# random_state pins the forest's bootstrap/feature sampling so that the
# submission can be reproduced from a fresh kernel.
rf = RandomForestRegressor(max_features=0.55, max_leaf_nodes=33000,
                           n_estimators=53, n_jobs=-1, random_state=0)
rf.fit(X, err_train)
pred_rf = rf.predict(eval_features)
y_pred = pred_lasso + pred_rf
# NOTE(review): nothing here stops y_pred from going negative -- confirm
# whether the competition metric wants predictions clipped at 0.

# Restore ids for writing to file
eval_data["TweetID"] = ids

[ 2.33726848e-01  7.54933828e-06 -6.87636381e-06  1.21063698e-03
 -3.86879125e-09  3.25730401e+00 -4.46620328e-04]


In [34]:
def write_predictions_to_file(file_name, eval_data, predictions):
    """Write the predictions to ``file_name`` as a two-column CSV.

    The file starts with the header row ``TweetID,retweets_count``. It is
    followed by one row per prediction: the i-th row pairs
    ``eval_data['TweetID'].iloc[i]`` with the i-th prediction rounded to an
    integer.

    Args:
        file_name: Path of the CSV file to create (overwritten if it exists).
        eval_data: DataFrame with a ``TweetID`` column, matched to
            ``predictions`` by position.
        predictions: Iterable of numeric predictions.
    """
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; without it, platforms that translate '\n' produce a blank
    # line after every row.
    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "retweets_count"])
        # Look the column up once instead of on every iteration.
        tweet_ids = eval_data['TweetID']
        for index, prediction in enumerate(predictions):
            # int() guarantees an integer literal ("3", never "3.0") even if
            # round() returns a float, which some NumPy scalar versions do.
            writer.writerow([str(tweet_ids.iloc[index]), str(int(round(prediction)))])

In [35]:
# Write the combined predictions, one row per evaluation tweet, to the
# submission file.
write_predictions_to_file(
    file_name="submission.csv",
    eval_data=eval_data,
    predictions=y_pred,
)