In [11]:
from ast import literal_eval

import numpy as np
import pandas as pd
import sklearn.utils

import matplotlib.pylab as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Reproducibility: keep the seed in one named value and feed that same value
# to NumPy's global random state.
rand_seed = 101
np.random.seed(rand_seed)

# Row-count constant (one million). It is not referenced in the cells visible
# here -- presumably a subsample size for later steps; confirm.
n_rows = 10 ** 6

In [49]:
%%time
ROOT = "./csv"
df = pd.read_csv(
    ROOT + "/mergedtweets.csv", 
    encoding='utf-8', 
    low_memory=False, 
    parse_dates=False
)

CPU times: user 33.5 s, sys: 10.1 s, total: 43.6 s
Wall time: 47.9 s


In [50]:
# Structural overview of the loaded frame, printed in order:
# dimensions, per-column dtypes, then the leading rows.
for overview in (df.shape, df.dtypes, df.head()):
    print(overview)

(2203451, 19)
user_id                    int64
user_key                  object
created_at                 int64
created_str               object
retweet_count              int64
retweeted                   bool
favorite_count             int64
text                      object
tweet_id                   int64
source                    object
hashtags                  object
expanded_urls             object
mentions                  object
retweeted_status_id        int64
in_reply_to_status_id      int64
class                    float64
tokenized_text            object
stem_text                 object
lemma_text                object
dtype: object
      user_id         user_key     created_at          created_str  \
0  2532611755        kathiemrr  1488207240000  2017-02-27 14:54:00   
1  2531159968   traceyhappymom  1471272620000  2016-08-15 14:50:20   
2           0    evewebster373  1435701369000  2015-06-30 21:56:09   
3  4840551713      blacktolive  1474013088000  2016-09-16 08:04:4

### Feature Engineering


In [46]:
def lexical_diversity(text):
    """Return the ratio of distinct elements to total elements in ``text``.

    Parameters
    ----------
    text : sequence, None or float NaN
        Any sized collection of hashable items, e.g. a list of tokens.
        A ``str`` is treated as a sequence of characters, so the result is a
        character-level ratio; parse serialized token lists before calling
        if a token-level ratio is wanted.

    Returns
    -------
    float
        ``len(set(text)) / len(text)``, or ``0.0`` when ``text`` is empty or
        missing (``None`` / NaN).

    Raises
    ------
    TypeError
        If ``text`` is some other unsized object or contains unhashable items.
    """
    # pandas represents empty CSV cells as float NaN, which has no len();
    # treat missing values like empty text instead of aborting a whole
    # Series.apply(). `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return 0.0

    total = len(text)
    if total == 0:
        return 0.0
    return float(len(set(text))) / total

In [52]:
# Add one diversity score per normalised text column (new column <- source).
# NOTE(review): both source columns are dtype `object` as loaded from CSV; if
# they hold stringified token lists, lexical_diversity sees a str and scores
# characters rather than tokens -- confirm this is intended.
diversity_sources = (
    ('lemma_diversity', 'lemma_text'),
    ('stem_diversity', 'stem_text'),
)
for target_col, source_col in diversity_sources:
    df[target_col] = df[source_col].apply(lexical_diversity)

print(df.tail())

                    user_id         user_key     created_at  \
2203446  963619824265023488        aviviavai  1518579843952   
2203447  963619824503894016  davidinkuwait69  1518579844009   
2203448  963619824768376833     trumpliesbot  1518579844072   
2203449  963619825229611008        SteveoUSA  1518579844182   
2203450  963619825036783618       RichieRoby  1518579844136   

                            created_str  retweet_count  retweeted  \
2203446  Wed Feb 14 03:44:03 +0000 2018              0      False   
2203447  Wed Feb 14 03:44:04 +0000 2018              0      False   
2203448  Wed Feb 14 03:44:04 +0000 2018              0      False   
2203449  Wed Feb 14 03:44:04 +0000 2018              0      False   
2203450  Wed Feb 14 03:44:04 +0000 2018              0      False   

         favorite_count                                               text  \
2203446               0  b'RT @NicCageMatch: White People Once Kept Bla...   
2203447               0  b'The Ex Resident Obama u

In [48]:
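# NOTE: disabled scratch experiment, kept for reference. It parses the
# string-valued `lemma_text` entries with `literal_eval` on the first five
# rows before measuring diversity. Assigning columns on `df.head()` triggers
# pandas' SettingWithCopyWarning; use `df.head().copy()` if this is re-enabled.
# from ast import literal_eval

# df_test = df.head()
# df_test['lemma_text'] = df_test['lemma_text'].apply(literal_eval)
# df_test['diversity'] = df_test['lemma_text'].apply(lexical_diversity)

# print(df_test.tail())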

      user_id         user_key     created_at          created_str  \
0  2532611755        kathiemrr  1488207240000  2017-02-27 14:54:00   
1  2531159968   traceyhappymom  1471272620000  2016-08-15 14:50:20   
2           0    evewebster373  1435701369000  2015-06-30 21:56:09   
3  4840551713      blacktolive  1474013088000  2016-09-16 08:04:48   
4  1694026190  jacquelinisbest  1474227985000  2016-09-18 19:46:25   

   retweet_count  retweeted  favorite_count  \
0              0       True               0   
1              0       True               0   
2              0       True               0   
3             18      False              17   
4              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  61600230657274

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [40]:
# Rebuild the five-row scratch frame here: `df_test` is otherwise assigned only
# inside a commented-out cell, so this cell raised NameError on a fresh kernel.
# `.copy()` detaches it from `df`, avoiding the SettingWithCopyWarning pandas
# emits when columns are assigned on the result of `df.head()`.
df_test = df.head().copy()
# NOTE(review): assumes `lemma_text` holds Python-literal strings (e.g. the
# repr of a token list), as the original experiment did -- confirm.
df_test['lemma_text'] = df_test['lemma_text'].apply(literal_eval)
df_test['diversity'] = df_test['lemma_text'].apply(lexical_diversity)

print(df_test.head())
print(df_test.tail())

      user_id         user_key     created_at          created_str  \
0  2532611755        kathiemrr  1488207240000  2017-02-27 14:54:00   
1  2531159968   traceyhappymom  1471272620000  2016-08-15 14:50:20   
2           0    evewebster373  1435701369000  2015-06-30 21:56:09   
3  4840551713      blacktolive  1474013088000  2016-09-16 08:04:48   
4  1694026190  jacquelinisbest  1474227985000  2016-09-18 19:46:25   

   retweet_count  retweeted  favorite_count  \
0              0       True               0   
1              0       True               0   
2              0       True               0   
3             18      False              17   
4              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  61600230657274