In [20]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotnine as pn
from mizani.formatters import percent_format
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import median_absolute_error


# Notebook-wide display settings: show up to 500 columns of a frame and
# silence warnings raised from the plotnine module.
pd.options.display.max_columns = 500
warnings.filterwarnings(action='ignore', module='plotnine')

# Folder holding the downloaded CSV files, and the two file names read below.
data_loc = Path('../1-descarga-datos/')
users_file = '2021-08-11-handles-data.csv'
tweet_file = '2021-08-11-2021-08-12-2021-08-19-tweets-data.csv'

In [2]:
# Columns that pandas should try to parse as timestamps while reading each CSV.
user_dates = ['join_datetime']
tweet_dates = ['date']

# Read the account table and the tweet table from the download folder.
users = pd.read_csv(data_loc.joinpath(users_file), parse_dates=user_dates)
tweets = pd.read_csv(data_loc.joinpath(tweet_file), parse_dates=tweet_dates)



In [3]:
# Keep a single row per account id. Rows are ordered by follower count so that
# `keep='last'` retains the record with the highest count. `na_position='first'`
# matters: by default missing values are sorted last, which would let a row with
# no follower count win over a row that has one.
users = (
    users
    .sort_values('followers', na_position='first')
    .drop_duplicates(subset=['id'], keep='last')
    .reset_index(drop=True)
)

In [4]:
# Sometimes pandas is horrible.
# Tweet columns with fewer than two distinct values are collected for removal.
useless_cols = tweets.columns[tweets.nunique() < 2].tolist()

# Columns removed from each table before modelling.
user_text_cols = [
    'username', 'name', 'bio', 'location', 'avatar',
    'background_image', 'url', 'join_date', 'join_time',
]
tweet_text_cols = [
    'user_id_str', 'username', 'name', 'link', 'user_rt', 'place',
]

users = users.drop(columns=user_text_cols).reset_index(drop=True)
tweets = tweets.drop(columns=tweet_text_cols + useless_cols).reset_index(drop=True)

In [5]:
# We also see that there is an empty date: it is stored as a single space, so it
# is replaced by a missing value before the column is converted to datetimes.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
blank_join_date = users['join_datetime'] == ' '
users.loc[blank_join_date, 'join_datetime'] = np.nan
users['join_datetime'] = pd.to_datetime(users['join_datetime'])

In [6]:
# Boolean mask: True for user rows in which exactly three fields are missing.
bad_rows = users.isna().sum(axis='columns').eq(3)

In [7]:
# Keep only the user rows that were not flagged in `bad_rows`.
users = users.loc[~bad_rows].reset_index(drop=True)

In [8]:
# Keep only the tweets whose `retweet` flag is False.
tweets = tweets.loc[~tweets['retweet']].reset_index(drop=True)

In [9]:
# True when the tweet id equals its conversation id.
tweets['same_convo_id'] = tweets['id'].eq(tweets['conversation_id'])
# True when `reply_to` is anything other than the literal string '[]'.
tweets['has_reply_to'] = tweets['reply_to'].ne('[]')

In [10]:
# Keep only the tweets that were not flagged as having a `reply_to` entry.
tweets = tweets.loc[~tweets['has_reply_to']].reset_index(drop=True)

In [12]:
def count_list_items(column):
    """Count the entries of each stringified list held in ``column``.

    The previous implementation took the highest zero-based comma match index
    and added one, which equals the number of commas: a one-item list was
    counted as 0 and an n-item list as n - 1. Here a non-empty cell counts as
    (number of commas + 1) and an empty cell counts as 0.

    NOTE(review): assumes each cell is a comma-separated rendering of a list
    such as "['a', 'b']", with '[]' for an empty list - inferred from the
    `reply_to != '[]'` check in this notebook; confirm against the CSV.
    An item that itself contains a comma is still over-counted.

    Parameters
    ----------
    column : pandas.Series
        Column of stringified lists; missing values are treated as empty.

    Returns
    -------
    pandas.Series
        Integer item count per row, aligned with ``column``'s index.
    """
    text = column.fillna('[]').astype(str).str.strip()
    is_empty = text.isin(['', '[]'])
    return (text.str.count(',') + 1).where(~is_empty, 0)


# Number of characters in the tweet text.
tweets['tweet_length'] = tweets.tweet.str.len()

# One `num_<column>` count feature per list-like column.
for list_col in ['hashtags', 'cashtags', 'urls', 'photos']:
    tweets[f'num_{list_col}'] = count_list_items(tweets[list_col])

# True when `quote_url` differs from the string '0'.
tweets['has_quotes'] = tweets.quote_url != '0'

# True when a thumbnail value is present.
tweets['has_thumbnail'] = tweets.thumbnail.notna()

# Calendar date of the tweet (time of day dropped); used later to split the data.
tweets['day'] = tweets.date.dt.date


In [16]:
# Engagement counters carried along with the tweet features.
cont_cols = ['nlikes', 'nreplies', 'nretweets']

# Every user column except the join timestamp.
keep_users = [name for name in users.columns if name != 'join_datetime']

# Tweet columns kept for the final table, followed by the engagement counters.
keep_tweets = [
    'id', 'tweet', 'user_id', 'day', 'hour', 'language', 'video',
    'num_hashtags', 'tweet_length', 'num_photos', 'has_quotes', 'same_convo_id',
] + cont_cols

users = users.loc[:, keep_users].reset_index(drop=True)
tweets = tweets.loc[:, keep_tweets].reset_index(drop=True)

# Attach the author's attributes to every tweet. Clashing user columns receive
# the `_user` suffix, and the duplicated user id is dropped afterwards.
to_drop = ['id_user']
final_data = tweets.merge(
    users,
    left_on='user_id',
    right_on='id',
    suffixes=('', '_user'),
)
final_data = final_data.drop(columns=to_drop)

In [18]:
# Calendar days assigned to each split.
train_days = ['2021-08-12', '2021-08-13', '2021-08-14']
val_days = ['2021-08-15', '2021-08-16']
test_days = ['2021-08-17', '2021-08-18']

# Binary target: True for tweets with at least 500 likes.
final_data['target'] = final_data['nlikes'] >= 500

# Render the day column as text once, then select each split by its list of days.
day_labels = final_data['day'].astype(str)
train, val, test = (
    final_data[day_labels.isin(days)].reset_index(drop=True)
    for days in (train_days, val_days, test_days)
)

# Report the size of each split.
for split in (train, val, test):
    print(split.shape)

(13228, 23)
(7552, 23)
(9675, 23)


### Text

In [21]:
# Bag-of-words baseline: predict `target` from the tweet text alone.
# Careful: per the original author's note the tweets mix several languages,
# yet only English stop words are removed here.
vectorizer = CountVectorizer(
    min_df=3,
    strip_accents='unicode',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
)

# Learn the vocabulary from the training tweets only and vectorise them in the
# same pass; the test tweets are then encoded with that fixed vocabulary.
train_counts = vectorizer.fit_transform(train['tweet'])
test_counts = vectorizer.transform(test['tweet'])

model = RandomForestClassifier(random_state=1)
model.fit(train_counts, train['target'])

predictions = model.predict(test_counts)

# Share of test tweets whose predicted label matches the target (accuracy).
# NOTE(review): `val` is not used in this cell - the score is computed directly
# on `test`; confirm this is intended before tuning against this number.
print((predictions == test['target']).mean())
