In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale, normalize
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [2]:
def read_train(filename='data/train.csv'):
    return pd.read_csv(filename, encoding='utf-8', dialect='excel', lineterminator='\n')


def read_test(filename='data/test.csv'):
    return pd.read_csv(filename, encoding='utf-8', dialect='excel', lineterminator='\n')

In [3]:
class Features:
    @staticmethod
    def get_user_lang_feature(data, train_data):
        table = train_data[['user.lang', 'retweet_count']].groupby(by='user.lang').mean()
        table = table.to_dict()['retweet_count']
        return pd.Series.from_array([table.get(x, np.mean(list(table.values()))) for x in data['user.lang']])

    @staticmethod
    def get_text_feature(data, i):
        if i == 0:
            return pd.Series.from_array([x.count('http') for x in data['text']])
        elif i == 1:
            return pd.Series.from_array([x.count('@') for x in data['text']])
        elif i == 2:
            return pd.Series.from_array([x.count('#') for x in data['text']])
        elif i == 3:
            return pd.Series.from_array([len(x.split()) for x in data['text']])
        elif i == 4:
            return pd.Series.from_array([x.count('?') for x in data['text']])

    @staticmethod
    def get_in_reply_to_user_id_feature(data):
        return pd.Series.from_array([np.bool(x) for x in data['in_reply_to_user_id']])
    
    @staticmethod
    def get_user_description_feature(data, i):
        if i == 0:
            return pd.Series.from_array([x.count('http') for x in data['text']])
        elif i == 1:
            return pd.Series.from_array([x.count('@') for x in data['text']])
        elif i == 2:
            return pd.Series.from_array([x.count('#') for x in data['text']])
        elif i == 3:
            return pd.Series.from_array([len(x.split()) for x in data['text']])
        elif i == 4:
            return pd.Series.from_array([x.count('?') for x in data['text']])
    
    @staticmethod
    def get_user_time_zone_feature(data, train_data):
        table = train_data[['user.time_zone', 'retweet_count']].groupby(by='user.time_zone').mean()
        table = table.to_dict()['retweet_count']
        return pd.Series.from_array([table.get(x, np.mean(list(table.values()))) for x in data['user.time_zone']])


def df2features(data, train_data):
    return np.array([
        Features.get_text_feature(data, 0),
        Features.get_text_feature(data, 1),
        Features.get_text_feature(data, 2),
        Features.get_text_feature(data, 3),
        Features.get_text_feature(data, 4),
        data['in_reply_to_user_id'],
        Features.get_user_description_feature(data, 0),
        Features.get_user_description_feature(data, 1),
        Features.get_user_description_feature(data, 2),
        Features.get_user_description_feature(data, 3),
        Features.get_user_description_feature(data, 4),
        Features.get_user_lang_feature(data, train_data),
        Features.get_user_time_zone_feature(data, train_data),
        data['user.utc_offset'],
        data['user.statuses_count'],
        data['user.followers_count'],
        data['user.friends_count'],
        data['user.favourites_count'],
        data['user.is_translation_enabled'],
        data['user.geo_enabled'],
        data['user.listed_count']
    ]).transpose()

In [4]:
class Models:
    @staticmethod
    def model(model, param_grid, train_X, test_X, train_y, test_y, real_test_X):
        est = GridSearchCV(model, param_grid=param_grid, n_jobs=4)
        est.fit(train_X, train_y)

        proba_train = est.predict_proba(train_X)
        print(roc_auc_score(train_y, proba_train[:, 1]))

        proba_test = est.predict_proba(test_X)
        print(roc_auc_score(test_y, proba_test[:, 1]))

        proba_real_test = est.predict_proba(real_test_X)
        return proba_train[:, 1], proba_test[:, 1], proba_real_test[:, 1]

In [5]:
data, test_data = read_train(), read_test()
data_y = data['retweet_count'] > 20
train_X, test_X, train_y, test_y = train_test_split(data, data_y, test_size=0.33)
train_X, test_X, real_test_X = df2features(train_X, train_X), \
                               df2features(test_X, train_X), df2features(test_data, train_X)

In [6]:
model = ExtraTreesClassifier()
model.fit(train_X, train_y)
print(model.feature_importances_)

[ 0.01530391  0.02063559  0.02066867  0.06398109  0.0079293   0.01864598
  0.0172736   0.02019353  0.02170001  0.06350632  0.00787501  0.00899759
  0.04646124  0.04179874  0.07963917  0.18212388  0.06911849  0.07183471
  0.04342892  0.01896239  0.15992187]


In [7]:
_, _, proba = Models.model(RandomForestClassifier(), {'n_estimators': [i for i in range(80, 250, 30)],
                                                      'criterion':  ['gini', 'entropy'], 
                                                      'min_samples_leaf': [i for i in range(5, 101, 10)]}, 
                           train_X, test_X, train_y, test_y, real_test_X)

In [8]:
_, _, proba2 = Models.model(GradientBoostingClassifier(), {'n_estimators': [i for i in range(80, 250, 30)],
                                                           'criterion': ['friedman_mse', 'mse'],
                                                           'min_samples_leaf': [i for i in range(5, 101, 10)]},
                            train_X, test_X, train_y, test_y, real_test_X)

0.931354731834
0.920695873928


In [22]:
_, _, proba3 = Models.model(SVC(probability=True, verbose=True), {'kernel': ['poly', 'rbf']}, 
                      train_X, test_X, train_y, test_y, real_test_X)

In [9]:
prediction = pd.DataFrame(data={'id': test_data['id'], 'probability': proba2})
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

                   id  probability
0  629692042952765440     0.003357
1  629692042717855745     0.002738
2  629692039974813696     0.003716
3  629692038242566145     0.793212
4  629692036879413248     0.004685
