In [4]:
import numpy as np
import pandas as pd
import sklearn.utils

import matplotlib.pylab as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(101)
rand_seed = 101

In [5]:
%%time
ROOT = "./csv"
df = pd.read_csv(
    ROOT + "/propertweets.csv", 
    encoding='utf-8', 
    low_memory=False, 
    parse_dates=False
)

Wall time: 2min 12s


In [8]:
del df['Unnamed: 0']
print(df.shape)
print(df.dtypes)
print(df.head())

(10000000, 18)
user_id                   int64
user_key                 object
created_at                int64
created_str              object
retweet_count             int64
retweeted                  bool
favorite_count            int64
text                     object
tweet_id                  int64
source                   object
hashtags                 object
expanded_urls            object
mentions                 object
retweeted_status_id       int64
in_reply_to_status_id     int64
tokens                   object
stems                    object
lemmas                   object
dtype: object
              user_id         user_key     created_at  \
0  961701459900342272  Jeannie22757716  1518122470246   
1  961701477164105728        _p_body__  1518122474362   
2  961701487507312641   6728FixerUpper  1518122476828   
3  961701511289073665        LenAulett  1518122482498   
4  961701912880988161       Dsquared69  1518122578245   

                      created_str  retweet_count  re

### Feature Engineering


In [46]:
def lexical_diversity(text):
    if len(text) == 0:
        diversity = 0
    else: 
        diversity = float(len(set(text))) / len(text)
    return diversity

In [52]:
df['lemma_diversity'] = df['lemma_text'].apply(lexical_diversity)
df['stem_diversity'] = df['stem_text'].apply(lexical_diversity)

print(df.tail())

                    user_id         user_key     created_at  \
2203446  963619824265023488        aviviavai  1518579843952   
2203447  963619824503894016  davidinkuwait69  1518579844009   
2203448  963619824768376833     trumpliesbot  1518579844072   
2203449  963619825229611008        SteveoUSA  1518579844182   
2203450  963619825036783618       RichieRoby  1518579844136   

                            created_str  retweet_count  retweeted  \
2203446  Wed Feb 14 03:44:03 +0000 2018              0      False   
2203447  Wed Feb 14 03:44:04 +0000 2018              0      False   
2203448  Wed Feb 14 03:44:04 +0000 2018              0      False   
2203449  Wed Feb 14 03:44:04 +0000 2018              0      False   
2203450  Wed Feb 14 03:44:04 +0000 2018              0      False   

         favorite_count                                               text  \
2203446               0  b'RT @NicCageMatch: White People Once Kept Bla...   
2203447               0  b'The Ex Resident Obama u

## tf-idf: term frequency-inverse document frequency

In [10]:
vectorizer = TfidfVectorizer(min_df=5, analyzer='word', ngram_range=(1, 2), stop_words='english')
vz = vectorizer.fit_transform(list(df['tokens'].map(lambda tokens: ' '.join(tokens))))

vz.shape

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']
tfidf.tfidf.hist(bins=25, figsize=(15,7))