# Twitter Data Cleaning

___

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import Libraries</a></span></li><li><span><a href="#Increase-Max-Rows,-Columns,-Width" data-toc-modified-id="Increase-Max-Rows,-Columns,-Width-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Increase Max Rows, Columns, Width</a></span></li></ul></li><li><span><a href="#Import-Data" data-toc-modified-id="Import-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import Data</a></span></li><li><span><a href="#Convert-To-DataFrames" data-toc-modified-id="Convert-To-DataFrames-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Convert To DataFrames</a></span><ul class="toc-item"><li><span><a href="#Tweets" data-toc-modified-id="Tweets-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Tweets</a></span></li><li><span><a href="#Deets" data-toc-modified-id="Deets-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Deets</a></span></li><li><span><a href="#Users" data-toc-modified-id="Users-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Users</a></span></li></ul></li></ul></div>

___

## Setup

### Import Libraries

In [1]:
import sys
sys.path.append("..")
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import pandas as pd
import requests
import json
import math
import sklearn
from scipy import stats
from scipy.stats import norm
from sklearn.utils import resample
import pickle
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from wordcloud import WordCloud
import random
from collections import Counter
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import roc_curve, auc, confusion_matrix
import scipy.stats as stats

### Increase Max Rows, Columns, Width

In [111]:
# Raise pandas' display limits (rows, columns and total width) to 1000 each.
for option_name in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(option_name, 1000)

___

## Import Data

In [91]:
# Read the three raw JSON exports into plain Python objects.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode or reject the non-ASCII
# characters (e.g. emoji in user descriptions) contained in this data.
with open('../Data/search_tweets.json', encoding='utf-8') as a:
    tweets_data = json.load(a)
with open('../Data/tweets_deets.json', encoding='utf-8') as b:
    deets = json.load(b)
with open('../Data/users.json', encoding='utf-8') as c:
    users = json.load(c)

___

## Convert To DataFrames

### Tweets

In [27]:
# List the fields carried by the first search result.
# The raw search payload is bound to `tweets_data` by the load cell; the name
# `tweets` is only created further down (as a DataFrame), so indexing it here
# fails on a fresh kernel.
keys = list(tweets_data['statuses'][0])
keys

['created_at',
 'id',
 'id_str',
 'text',
 'truncated',
 'entities',
 'metadata',
 'source',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'in_reply_to_screen_name',
 'user',
 'geo',
 'coordinates',
 'place',
 'contributors',
 'retweeted_status',
 'is_quote_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'retweet_count',
 'favorite_count',
 'favorited',
 'retweeted',
 'lang']

In [158]:
statuses = tweets_data['statuses']

# Output column -> extractor applied to every status dict, in column order.
# Deliberately not extracted:
#   - 'text' / 'truncated': unnecessary, the full tweet text comes from "deets"
#   - the user's screen name: unnecessary, this comes from "users"
#   - 'quoted_status_id' / 'quoted_status_id_str': disabled by the original
#     author (presumably missing on tweets that are not quotes - TODO confirm)
tweet_extractors = {
    'tweet_created_at': lambda status: status['created_at'],
    'tweet_id': lambda status: status['id'],
    'tweet_id_str': lambda status: status['id_str'],
    'tweet_entities': lambda status: status['entities'],
    'tweet_source': lambda status: status['source'],
    'tweet_in_reply_to_status_id': lambda status: status['in_reply_to_status_id'],
    'tweet_in_reply_to_status_id_str': lambda status: status['in_reply_to_status_id_str'],
    'tweet_in_reply_to_user_id': lambda status: status['in_reply_to_user_id'],
    'tweet_in_reply_to_user_id_str': lambda status: status['in_reply_to_user_id_str'],
    'tweet_in_reply_to_screen_name': lambda status: status['in_reply_to_screen_name'],
    # NOTE: despite the column name, this reads the location stored on the
    # tweet's nested 'user' object.
    'tweet_location': lambda status: status['user']['location'],
    'tweet_coordinates': lambda status: status['coordinates'],
    'tweet_geo': lambda status: status['geo'],
    'tweet_place': lambda status: status['place'],
    'tweet_contributors': lambda status: status['contributors'],
    # 'retweeted_status' only exists on retweets, so use .get() to record None
    # for every other tweet instead of raising KeyError.
    'tweet_retweeted_status': lambda status: status.get('retweeted_status'),
    'tweet_is_quote_status': lambda status: status['is_quote_status'],
    # redundant column - this also comes from "deets"
    'tweet_retweet_count': lambda status: status['retweet_count'],
    'tweet_favorite_count': lambda status: status['favorite_count'],
    'tweet_favorited': lambda status: status['favorited'],
    'tweet_retweeted': lambda status: status['retweeted'],
    'tweet_language': lambda status: status['lang'],
}

# Build the frame one column at a time; each column is assigned exactly once
# (the earlier version re-assigned several columns with identical data).
tweets = pd.DataFrame()
for column_name, extract in tweet_extractors.items():
    tweets[column_name] = [extract(status) for status in statuses]

In [159]:
# Show the frame transposed: one row per field, one column per tweet.
tweets.transpose()

Unnamed: 0,0,1
tweet_created_at,Fri Jan 31 00:22:27 +0000 2020,Fri Jan 31 00:22:26 +0000 2020
tweet_id,1223038801519562752,1223038795181723648
tweet_id_str,1223038801519562752,1223038795181723648
tweet_entities,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'hashtags': [], 'symbols': [], 'user_mentions..."
tweet_source,"<a href=""http://twitter.com/download/iphone"" r...","<a href=""http://twitter.com/download/iphone"" r..."
tweet_in_reply_to_status_id,,
tweet_in_reply_to_status_id_str,,
tweet_in_reply_to_user_id,,
tweet_in_reply_to_user_id_str,,
tweet_in_reply_to_screen_name,,


### Deets

In [150]:
# Peek at the first deets record to see which fields it carries.
sample_deet = deets['data'][0]
sample_deet.keys()

dict_keys(['attachments', 'author_id', 'created_at', 'entities', 'id', 'lang', 'possibly_sensitive', 'source', 'stats', 'text', 'format'])

In [125]:
# Peek at the counters nested under the first deets record's "stats" key.
sample_deet_stats = deets['data'][0]['stats']
sample_deet_stats.keys()

dict_keys(['retweet_count', 'reply_count', 'like_count', 'quote_count'])

In [132]:
deet_records = deets['data']
deets_df = pd.DataFrame()

# Top-level fields copied over as-is (output column, source key).
# Left out: 'attachments' (disabled by the author, no reason recorded),
# 'source' and 'format' (unnecessary columns), and the raw 'stats' dict
# (unnecessary - its keys are unpacked into their own columns below).
for column_name, source_key in (
    ('author_id', 'author_id'),
    ('created_at', 'created_at'),
    ('tweet_id', 'id'),
    ('lang', 'lang'),
    ('possibly_sensitive', 'possibly_sensitive'),
    ('text', 'text'),
):
    deets_df[column_name] = [record[source_key] for record in deet_records]

# Counters extracted from each record's "stats" key.
for stat_name in ('retweet_count', 'reply_count', 'like_count', 'quote_count'):
    deets_df[stat_name] = [record['stats'][stat_name] for record in deet_records]

# Feature engineering: len() of each tweet's text.
deets_df['text_length'] = [len(record['text']) for record in deet_records]

In [133]:
# Display the assembled deets frame as this cell's output.
# NOTE(review): this renders the whole frame, and display.max_rows is raised
# to 1000 in the setup section - consider deets_df.head() if the data grows.
deets_df

Unnamed: 0,author_id,created_at,tweet_id,lang,possibly_sensitive,text,retweet_count,reply_count,like_count,quote_count,text_length
0,25073877,2020-01-31T00:19:23.000Z,1223038027234267137,en,False,"Great poll in Iowa, where I just landed for a ...",2406,1346,8517,192,91
1,783214,2020-01-30T20:00:13.000Z,1222972807639896064,en,False,"Yes, it's still January",28532,3049,77352,4354,23
2,409486555,2020-01-20T13:56:40.000Z,1219257438949597185,en,False,"To honor Dr. King's legacy, we all can play a ...",10107,508,44343,253,303
3,14130366,2020-01-26T05:08:25.000Z,1221298825786089472,en,False,"This Black History Month, we’ll be celebrating...",153,56,1068,8,284
4,1636590253,2020-01-27T15:15:10.000Z,1221813908576464896,en,False,"Today, we remember the millions of lives lost....",678,119,5556,32,273


### Users

In [151]:
# Peek at the first user record to see which fields it carries.
sample_user = users['data'][0]
sample_user.keys()

dict_keys(['created_at', 'description', 'entities', 'id', 'location', 'most_recent_tweet_id', 'name', 'pinned_tweet_id', 'profile_image_url', 'protected', 'stats', 'url', 'username', 'verified', 'format'])

In [137]:
# Peek at the counters nested under the first user record's "stats" key.
sample_user_stats = users['data'][0]['stats']
sample_user_stats.keys()

dict_keys(['followers_count', 'following_count', 'tweet_count', 'listed_count'])

In [145]:
user_records = users['data']
users_df = pd.DataFrame()

# Top-level fields copied over as-is (output column, source key).
# Left out: 'entities' (unnecessary column) and the raw 'stats' dict
# (unnecessary - its keys are unpacked into their own columns below).
# TODO: 'location' -> user_location and 'pinned_tweet_id' ->
# user_pinned_tweet_id are still not extracted (marked "GET THIS TO WORK" by
# the author); presumably some user records lack these keys - verify.
for column_name, source_key in (
    ('user_created_at', 'created_at'),
    ('user_description', 'description'),
    ('user_id', 'id'),
    ('user_real_name', 'name'),
    ('user_profile_image_url', 'profile_image_url'),
    ('user_protected', 'protected'),
    ('user_url', 'url'),
    ('username', 'username'),
    ('verified', 'verified'),
    ('format', 'format'),
):
    users_df[column_name] = [record[source_key] for record in user_records]

# Counters extracted from each record's "stats" key, prefixed with "user_".
for stat_name in ('followers_count', 'following_count', 'tweet_count', 'listed_count'):
    users_df['user_' + stat_name] = [record['stats'][stat_name] for record in user_records]

# Feature engineering: len() of each user's description.
users_df['user_description_length'] = [len(record['description']) for record in user_records]

In [160]:
# Display the assembled users frame as this cell's output.
# NOTE(review): this renders the whole frame, and display.max_rows is raised
# to 1000 in the setup section - consider users_df.head() if the data grows.
users_df

Unnamed: 0,user_created_at,user_description,user_id,user_real_name,user_profile_image_url,user_protected,user_url,username,verified,format,user_followers_count,user_following_count,user_tweet_count,user_listed_count,user_description_length
0,2009-03-18T13:46:38.000Z,45th President of the United States of America🇺🇸,25073877,Donald J. Trump,https://pbs.twimg.com/profile_images/874276197...,False,https://t.co/OMxB0x7xC5,realDonaldTrump,True,detailed,71873110,47,48500,113961,48
1,2007-02-20T14:35:54.000Z,What’s happening?!,783214,Twitter,https://pbs.twimg.com/profile_images/111172963...,False,https://t.co/TAXQpsHa5X,Twitter,True,detailed,57135266,1,12909,90552,18
2,2011-11-10T20:13:01.000Z,Girl from the South Side and former First Lady...,409486555,Michelle Obama,https://pbs.twimg.com/profile_images/119281123...,False,https://t.co/0UVvR5L6vm,MichelleObama,True,detailed,14331923,18,1225,24897,96
3,2008-03-12T05:51:53.000Z,"CEO, Google and Alphabet",14130366,Sundar Pichai,https://pbs.twimg.com/profile_images/864282616...,False,,sundarpichai,True,detailed,2636216,337,1316,6747,25
4,2013-07-31T22:41:25.000Z,Apple CEO  Auburn 🏀 🏈 Duke 🏀 National Parks 🏞...,1636590253,Tim Cook,https://pbs.twimg.com/profile_images/119411373...,False,,tim_cook,True,detailed,11677552,68,976,21430,135
