# Exploratory Data Analysis

This notebook performs an initial EDA based on the sample data extracted during the data pull and etl steps.

In [1]:
import os
from os.path import join
import eland as ed
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

project_dir = join(os.getcwd(), os.pardir)
raw_dir = join(project_dir, 'data', 'raw')
interim_dir = join(project_dir, 'data', 'interim')
db_name = 'data_pull_sample.db'

%config InlineBackend.figure_format = 'svg'

In [2]:
ed_df = ed.read_es('localhost', 'twitter')
df_twt = ed_df.to_pandas()
df_users = df_twt.loc[
    :,
    list(df_twt.columns[df_twt.columns.str.startswith('user')]) + ['verified', 'protected']
].copy().drop_duplicates('user_id')

df_twt['is_original'] = ~df_twt[['is_retweet', 'is_quote_status', 'is_reply']].max(1)

## EDA Questions:
- How many tweets are in the dataset?
- How many unique tweets are in the dataset?
- How many unique users?
- Number of retweets
- Locate the top N retweeted and liked tweets and the users that posted them
- What are the locations? (Number of tweets by location)
- What are the main languages? What are the counts and proportions of tweets by languages?
- Location over time
- Source analysis
- Protected
- Verified

___
## Tweet Analysis
### How many tweets are in the dataset? How many unique tweets are in the dataset? Number of retweets?

In [3]:
print(f"""
Total number of Tweets:\t\t{df_twt.shape[0]}
Number of original Tweets:\t{df_twt.is_original.sum()}
Number of retweets:\t\t{df_twt.is_retweet.sum()}
Number of replies:\t\t{df_twt.is_reply.sum()}
Number of quotes:\t\t{df_twt.is_quote_status.sum()}

Note: Many of the quote statuses/tweets are both marked as quotes and retweets.
""")


Total number of Tweets:		474419
Number of original Tweets:	94018
Number of retweets:		361077
Number of replies:		14478
Number of quotes:		14190

Note: Many of the quote statuses/tweets are both marked as quotes and retweets.



### Locate the top N retweeted and liked tweets and the users that posted them

In [4]:
N = 20
df_twt[df_twt.is_original==True]\
    .nlargest(N, 'retweet_count')\
    [['tweet_id', 'full_text', 'retweet_count', 'name']]\
    .reset_index(drop=True)

Unnamed: 0,tweet_id,full_text,retweet_count,name
0,1268202497216319488,https://t.co/sFwcDyxcgA,59487,Ratan N. Tata
1,1262797667677212672,We need a president who believes in science.,42995,Joe Biden
2,1263213348000325632,u up? @NASA,27047,Twitter
3,1263091517767536640,Well done @RubikaLiyaquat for standing up for ...,26243,Sudhir Chaudhary
4,1263828899575758848,"15 yr old Jyoti Kumari, carried her wounded fa...",24152,Ivanka Trump
5,1263815469145788416,Deeply saddened by the loss of life due to a p...,21544,Narendra Modi
6,1263516296093863936,"If you reply to this tweet I’ll give you $10,000!",18841,MrBeast
7,1266152680428236800,I have lost control of the situation.,18489,God
8,1264121291365216256,घटना : मध्यप्रदेश के छिंदवाड़ा जिले के पिपला थ...,16721,Devvesh Pandey | देवेश पांडेय | دیویش پانڈے۔
9,1263320513276817408,"एक सच्चे देशभक्त,उदार और परोपकारी पिता के पुत्...",16609,Rahul Gandhi


### What are the locations? (Number of tweets by location)

In [8]:
df_twt.columns[df_twt.columns.str.contains('loca')]

Index(['location', 'users_derived_locality'], dtype='object')

In [13]:
no_loc_count = df_twt['location'].isnull().sum()
print(f"""Please note that {no_loc_count} tweets (~{int(no_loc_count/df_twt.shape[0]*100)}% \
of the dataset) do not contain location info.""")

top_locations = df_twt.groupby('location')\
    .size()\
    .to_frame()\
    .reset_index()\
    .rename(columns={0:'Count', 'location':'Location'})\
    .set_index('Location')\
    .nlargest(20, 'Count')

top_locations

Please note that 140614 tweets (~29% of the dataset) do not contain location info.


Unnamed: 0_level_0,Count
Location,Unnamed: 1_level_1
India,31746
"Kolkata, India",14966
"New Delhi, India",12825
"Mumbai, India",6983
"West Bengal, India",4748
Kolkata,4061
New Delhi,3223
"Bengaluru, India",3060
भारत,2629
Mumbai,2472


### What are the main languages? What are the counts and proportions of tweets by languages?

In [14]:
no_loc_count = df_twt.lang.isnull().sum()
print(f"""Please note that {no_loc_count} tweets (~{int(no_loc_count/df_twt.shape[0]*100)}% \
of the dataset) do not contain tweet language info.""")

df_twt.groupby('lang')\
    .size()\
    .to_frame()\
    .reset_index()\
    .rename(columns={0:'Count', 'lang':'Language'})\
    .set_index('Language')\
    .nlargest(20, 'Count')

Please note that 0 tweets (~0% of the dataset) do not contain tweet language info.


Unnamed: 0_level_0,Count
Language,Unnamed: 1_level_1
en,344745
hi,61384
bn,23695
und,11191
es,6617
or,4990
fr,3796
in,2428
ja,2248
de,1779


### Source analysis

In [15]:
no_loc_count = df_twt.source.isnull().sum()
print(f"""Please note that {no_loc_count} tweets (~{int(no_loc_count/df_twt.shape[0]*100)}% \
of the dataset) do not contain source info.""")

main_sources = {
    '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 'Android',
    '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>': 'Web',
    '<a href="https://mobile.twitter.com" rel="nofollow">Mobile Web (M2)</a>': 'Web',
    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 'iPhone',
    '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 'iPad',
}

df_twt['main_sources'] = df_twt['source'].apply(
    lambda x: main_sources[x] 
    if x in main_sources.keys() 
    else 'Others/Unknown'
)

df_twt.groupby('main_sources')\
    .size()\
    .to_frame()\
    .reset_index()\
    .rename(columns={0:'Count', 'main_sources':'Source'})\
    .set_index('Source')\
    .nlargest(20, 'Count')

Please note that 0 tweets (~0% of the dataset) do not contain source info.


Unnamed: 0_level_0,Count
Source,Unnamed: 1_level_1
Android,304622
Web,70627
iPhone,52497
Others/Unknown,42913
iPad,3760


### Location over time

In [17]:
pd.options.mode.chained_assignment = None

top_locations_list = top_locations.iloc[:8]\
    .index.tolist()

df_places = df_twt[['tweet_created_at', 'location']]
df_places.loc[:,'created_at'] = pd.to_datetime(df_places.tweet_created_at).dt.date
df_places.loc[:,'location'] = df_places['location'].apply(
    lambda x: x 
    if x in top_locations_list
    else 'Others/Unknown'
)

df_places = df_places\
    .drop(columns='tweet_created_at')\
    .groupby(['created_at', 'location']).size()\
    .to_frame().reset_index()\
    .rename(columns={0:'Counts', 'location':'Location', 'created_at': 'Date'})\
    .sort_index()

sns.set(style="darkgrid", context="talk")
sns.lineplot(
    data=df_places, hue='Country', x='Date', y='Counts'
).legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontsize='small'  
)


plt.show()

ValueError: Could not interpret input 'Country'

In [None]:
pd.options.mode.chained_assignment = None

top_locations_list = top_locations.iloc[:8]\
    .index.tolist()

df_places = df_twt[['tweet_created_at', 'location']]
df_places.loc[:,'created_at'] = pd.to_datetime(df_places.tweet_created_at).dt.date
df_places.loc[:,'derived.locations.country'] = df_places['location'].apply(
    lambda x: x 
    if x in top_locations_list
    else 'Others/Unknown'
)

df_places = df_places[~df_places['location'].isin(['Others/Unknown', 'India'])]

df_places = df_places\
    .drop(columns='tweet_created_at')\
    .groupby(['created_at', 'location']).size()\
    .to_frame().reset_index()\
    .rename(columns={0:'Counts', 'location':'Location', 'created_at': 'Date'})\
    .sort_index()

sns.set(style="darkgrid", context="talk")
sns.lineplot(
    data=df_places, hue='Country', x='Date', y='Counts'
).legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontsize='small'  
)


plt.show()

### Based on User Location

In [None]:
pd.options.mode.chained_assignment = None

top_locations_list = top_locations.iloc[:8]\
    .index.tolist()

df_places = df_twt[['tweet_created_at', 'users_derived_country']]
df_places.loc[:,'created_at'] = pd.to_datetime(df_places.tweet_created_at).dt.date
df_places.loc[:,'users_derived_country'] = df_places['users_derived_country'].apply(
    lambda x: x 
    if x in top_locations_list
    else 'Others/Unknown'
)

df_places = df_places[~df_places['users_derived_country'].isin(['Others/Unknown', 'India'])]

df_places = df_places\
    .drop(columns='tweet_created_at')\
    .groupby(['created_at', 'users_derived_country']).size()\
    .to_frame().reset_index()\
    .rename(columns={0:'Counts', 'users_derived_country':'Country', 'created_at': 'Date'})\
    .sort_index()

sns.set(style="darkgrid", context="talk")
sns.lineplot(
    data=df_places, hue='Country', x='Date', y='Counts'
).legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.xticks(
    rotation=45, 
    horizontalalignment='right',
    fontsize='small'  
)


plt.show()

___
## User Analysis
### How many unique users? Verified and Protected Users Analysis

In [None]:
print(f"""
Number of unique users: {df_users.drop_duplicates(subset='user_id').shape[0]}
Number of verified users: {df_users.verified.sum()}
Number of protected users: {df_users.protected.sum()}
""")