# Exploratory Analysis

What are people talking about in our tweet data?

In [30]:
import pandas as pd
from dtypes import dtype, parse_dates
import glob

# Collect every processed CSV. glob returns paths in arbitrary order, so sort
# them to keep the concatenation order identical from run to run.
data_paths = sorted(glob.glob("../data/processed_data/*.csv"))
if not data_paths:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files matched ../data/processed_data/*.csv")

# dtype / parse_dates come from the project's `dtypes` module.
frames = [
    pd.read_csv(f, header=0, dtype=dtype, parse_dates=parse_dates)
    for f in data_paths
]

# Single frame indexed by "id", oldest rows first. mergesort is a stable sort,
# so rows sharing a timestamp keep their (now deterministic) file order.
total_df = (
    pd.concat(frames, axis=0, ignore_index=True)
    .set_index("id")
    .sort_values("created_at", kind="mergesort")
)

# Bucket timestamps into calendar days. A Period cannot carry a timezone, so
# strip it explicitly (wall-clock time is kept) rather than letting
# to_period() drop it implicitly with a UserWarning.
total_df["created_at"] = (
    total_df["created_at"].dt.tz_localize(None).dt.to_period("D")
)

# NOTE(review): the set printed below has contained both 'Québec' and 'Quebec';
# they look like the same province under two spellings and will be treated as
# separate groups downstream -- confirm and normalise upstream if so.
print(set(total_df["province"]))
total_df.head()

{nan, 'California', 'Nova Scotia', 'Prince Edward Island', 'Manitoba', 'Maryland', 'Ontario', 'Nunavut', 'Newfoundland and Labrador', 'Alberta', 'Québec', 'São Paulo', 'Saskatchewan', 'Paraíba', 'British Columbia', 'Northwest Territories', 'Victoria', 'Tennessee', 'New Brunswick', 'New York', 'New South Wales', 'England', 'Texas', 'Yukon', 'Quebec'}



Converting to PeriodArray/Index representation will drop timezone information.



Unnamed: 0_level_0,created_at,screen_name,source,clean_text,original_text,is_retweet,favorite_count,retweet_count,hashtags,urls,mentions,city,province,longitude,latitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1236442738612207620,2020-03-08,Oreobonix,Twitter for Android,"['indian', 'friend', 'dinner', 'white', 'frien...",An Indian friend just had dinner w a white fri...,True,0,0,,,['sairasameerarao'],Mississauga,Ontario,-79.6583,43.5789
1236444031447597056,2020-03-08,QuiDocet_Discit,Twitter for Android,"['indian', 'friend', 'dinner', 'white', 'frien...",An Indian friend just had dinner w a white fri...,True,0,0,,,['sairasameerarao'],Toronto,Ontario,-79.4163,43.70011
1236446314801446912,2020-03-08,Leliye__,Twitter for iPhone,"['break', 'news', 'grove', 'unified', 'school'...",🚨BREAKING NEWS: Elk Grove Unified School Distr...,True,0,0,,,['MarleiMartinez'],,,,
1236447149661073408,2020-03-08,trinachp_19,Twitter for Android,"['indian', 'friend', 'dinner', 'white', 'frien...",An Indian friend just had dinner w a white fri...,True,0,0,,,['sairasameerarao'],London,Ontario,-81.23304,42.98339
1236451877044322305,2020-03-08,JezkaSeesink,Twitter for Android,"['indian', 'friend', 'dinner', 'white', 'frien...",An Indian friend just had dinner w a white fri...,True,0,0,,,['sairasameerarao'],,,-113.64258,60.10867


In [29]:
import plotly.express as px
# Number of rows with a non-null "screen_name" per (day, province). Selecting
# the column before counting yields the same numbers as counting every column
# and discarding the rest, without the wasted work.
# NOTE: groupby drops rows whose "province" is NaN by default, so tweets with
# no province do not appear in the chart.
counts = (
    total_df.groupby(["created_at", "province"])["screen_name"]
    .count()
    .rename("count")
    .reset_index()
)
# Turn the daily Periods back into timestamps (start of each period) for plotting.
counts["created_at"] = counts["created_at"].dt.to_timestamp("s")

# Keyword arguments for the line chart: one line per province.
args = {
    "x": "created_at",
    "y": "count",
    "color": "province",
    "line_shape": "spline",
    "title":  "Tweets Over Time",
    "template": "simple_white",
    "labels": {"count": "# Tweets", "created_at": "Date", "province": "Province"},
}

fig = px.line(counts, **args)
# Save a standalone HTML copy next to the other exploratory visualizations,
# then render the figure inline.
fig.write_html("../visualizations/exploratory_analysis/sample-tweets_over_time.html")
fig.show()