# Deep Neural Network


In [46]:
import numpy as np
import pandas as pd
import sns as sns
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit  # pip install verstack


# Load the training data
train_data = pd.read_csv("data/train.csv")
train_data=train_data.drop(['mentions'], axis=1)

# Load the evaluation data
eval_data = pd.read_csv("data/evaluation.csv")
eval_data=eval_data.drop(['mentions'], axis=1)

# split data
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.8, test_size=0.2)
# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(['retweets_count'], axis=1)
X_test = X_test.drop(['retweets_count'], axis=1)

# Extracting features


## Time


In [47]:
from datetime import datetime
def extract_time_features(df):
    rs_df = df
    rs_df["hour"] = rs_df['timestamp'].apply(
        lambda t: (datetime.fromtimestamp(t//1000).hour))
    rs_df["day"] = rs_df['timestamp'].apply(
        lambda t: (datetime.fromtimestamp(t//1000)).weekday())
    rs_df["week_in_month"] = rs_df['timestamp'].apply(
        lambda t: (datetime.fromtimestamp(t//1000).day)//7)  
    rs_df=rs_df.drop(['timestamp'], axis=1)
    return rs_df

## Metric transformation

In [48]:

def extract_ratio_features(df):
    rs_df = df
    rs_df['followers__favorites'] = rs_df['followers_count'] * rs_df['favorites_count']
    rs_df['friends__favorites'] = rs_df['friends_count'] * rs_df['favorites_count']
    rs_df['followers__friends__favorites'] = rs_df['followers_count'] * rs_df['friends_count'] * rs_df['favorites_count']
    return rs_df

from scipy.stats import norm
from scipy.stats import zscore
def extract_transfo(df,columns):
    rs_df = df
    for col in columns:
        mean = rs_df[col].mean()
        std = rs_df[col].std()
        rs_df[col+'_cdf'] = norm.cdf(rs_df[col].values, loc=mean, scale=std)
        rs_df[col+'_z'] = zscore(rs_df[col].values)       
        rs_df[col+'_rank'] = rs_df[col].rank(method='min')
        rs_df[col+'_log'] = (df[col] + 1).apply(np.log)
    return rs_df
    

## Text

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
def extract_topic(df):
    rs_df = df
    rs_df['hashtags'] = rs_df['hashtags'].apply(
        lambda x: x.replace('[', '').replace(']', '').replace("'", ''))
    #join text and hashtags
    rs_df['total_text'] = rs_df['text'] + ' ' + rs_df['hashtags']
    vectorizer = TfidfVectorizer(min_df=1, max_features=None, stop_words=stopwords.words('french'))
    vector = vectorizer.fit_transform(rs_df['text'])
    svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
    svd.fit(vector)
    topic=svd.transform(vector)
    rs_df['topic_1'] = topic[:,0]
    rs_df['topic_2'] = topic[:,1]
    rs_df['topic_3'] = topic[:,2]
    rs_df['topic_4'] = topic[:,3]
    rs_df['topic_5'] = topic[:,4]
    rs_df=rs_df.drop(['hashtags'],axis=1)
    rs_df=rs_df.drop(['total_text'],axis=1)
    return rs_df

from textblob import TextBlob  # pip install textblob-fr
from textblob_fr import PatternTagger, PatternAnalyzer


def sent_engineering(in_df):
    rs_df = in_df
    # add columns related to sentiment analysis
    rs_df['polarity'] = rs_df['text'].apply(lambda x: TextBlob(
        x, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()).sentiment[0])
    rs_df['subjectivity'] = rs_df['text'].apply(lambda x: TextBlob(
        x, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()).sentiment[1])
    # drop the text column
    rs_df = rs_df.drop(['text'], axis=1)
    return rs_df

def extract_url(in_df):
    #count url
    rs_df = in_df
    rs_df['url_count'] = rs_df['urls'].apply(lambda x: len(x.split(',')))
    rs_df=rs_df.drop(['urls'],axis=1)

    return rs_df


## Cluster

In [50]:
#import kmeans
from sklearn.cluster import KMeans
def extract_cluster(df,columns):
    rs_df = df
    rs_df['cluster'] = KMeans(n_clusters=100, random_state=0).fit_predict(rs_df[columns].values)
    return rs_df

## Our categories of features

In [51]:
tweet_metrics_features = [
    'followers_count', 'friends_count', 'favorites_count',
    'followers__favorites', 'friends__favorites', 'followers__friends__favorites',
]

tweet_metrics_log_features = [feat+'_log' for feat in tweet_metrics_features]
tweet_metrics_cdf_features = [feat+'_cdf' for feat in tweet_metrics_features]
tweet_metrics_z_features = [feat+'_z' for feat in tweet_metrics_features]
tweet_metrics_rank_features = [feat+'_rank' for feat in tweet_metrics_features]
time_cat_features = ['hour', 'day', 'week_of_month']
text_features = ['subjectivity', 'polarity', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5']
other_features = ['url_count', 'cluster']


# Preparation of dataset

In [52]:


new_X_train=extract_time_features(X_train)
new_X_train=extract_ratio_features(new_X_train)
new_X_train=extract_transfo(new_X_train,tweet_metrics_features)
new_X_train=extract_topic(new_X_train)
new_X_train=sent_engineering(new_X_train)
new_X_train=extract_url(new_X_train)
new_X_train=extract_cluster(new_X_train,['followers_count', 'friends_count', 'favorites_count','statuses_count'])



col=new_X_train.columns


           

        

In [53]:
new_X_test=extract_time_features(X_test)
new_X_test=extract_ratio_features(new_X_test)
new_X_test=extract_transfo(new_X_test,tweet_metrics_features)
new_X_test=extract_topic(new_X_test)
new_X_test=sent_engineering(new_X_test)
new_X_test=extract_url(new_X_test)
new_X_test=extract_cluster(new_X_test,['followers_count', 'friends_count', 'favorites_count','statuses_count'])



In [54]:



from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler #standard scaler seems to perform better

scaler = StandardScaler()
scaler.fit(new_X_train)
new_X_train_s = scaler.transform(new_X_train)
new_X_test_s = scaler.transform(new_X_test)

print(type(new_X_train_s))
#array to pd
new_X_train_s=pd.DataFrame(new_X_train_s, columns=col)
new_X_test_s=pd.DataFrame(new_X_test_s, columns=col)
#rindex columns
new_X_train_s=new_X_train_s.reindex(columns=col)
new_X_test_s=new_X_test_s.reindex(columns=col)
print(new_X_train_s.head(1))



<class 'numpy.ndarray'>
   favorites_count  followers_count  statuses_count  friends_count  verified  \
0        -0.055427        -0.077747       -0.414041      -0.559132 -0.175892   

    TweetID      hour       day  week_in_month  followers__favorites  ...  \
0  1.518406 -0.707831  1.035836       0.736276             -0.018636  ...   

   followers__friends__favorites_log   topic_1  topic_2   topic_3   topic_4  \
0                          -0.598136 -0.296151  -0.0715  0.296761 -0.339997   

   topic_5  polarity  subjectivity  url_count   cluster  
0  2.19863  0.686551      0.317495  -0.071037 -0.317596  

[1 rows x 45 columns]


# DNN

In [57]:
#import sequential
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras import regularizers

def build_dnn_model(input):
    model = Sequential()
    model.add(Dense(2048, input_dim=input.shape[1], activation='relu', kernel_regularizer=regularizers.L2(l2=0.0001)))
    model.add(Dropout(0.3))
    model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.L2(l2=0.0001)))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu',kernel_regularizer=regularizers.L2(l2=0.0001)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='relu', kernel_regularizer=regularizers.L2(l2=0.0001)))

    model.compile(loss='mae', optimizer='adam', metrics=['mae'])
    return model


remove=['TweetID']+tweet_metrics_rank_features+tweet_metrics_z_features+tweet_metrics_cdf_features+tweet_metrics_log_features

print('dropping useless columns:', remove)

print('Training model...')
model = build_dnn_model(
    new_X_train_s.drop(remove, axis=1))
history = model.fit(
    new_X_train_s.drop(remove,axis=1), y_train, epochs=12,  verbose=1, shuffle=True)

dropping useless columns: ['TweetID', 'followers_count_rank', 'friends_count_rank', 'favorites_count_rank', 'followers__favorites_rank', 'friends__favorites_rank', 'followers__friends__favorites_rank', 'followers_count_z', 'friends_count_z', 'favorites_count_z', 'followers__favorites_z', 'friends__favorites_z', 'followers__friends__favorites_z', 'followers_count_cdf', 'friends_count_cdf', 'favorites_count_cdf', 'followers__favorites_cdf', 'friends__favorites_cdf', 'followers__friends__favorites_cdf', 'followers_count_log', 'friends_count_log', 'favorites_count_log', 'followers__favorites_log', 'friends__favorites_log', 'followers__friends__favorites_log']
Training model...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [59]:
y_pred = model.predict(new_X_test_s.drop(remove, axis=1))
print(mean_absolute_error(y_test, y_pred))

7.235647015027092
