# Exploratory Analysis

What are people talking about?

In [22]:
from utils import DTYPE, PARSE_DATES, PROV_CONSOLIDATION, CONSOLIDATED_PROVINCES, CONVERTERS, ANCHOR_NAMES, PROVINCE_COLOR_MAP
from tqdm.auto import tqdm
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
import glob
tqdm.pandas()


def prov_map(x):
    """Return the consolidated province name for `x`; names without an entry pass through unchanged."""
    return PROV_CONSOLIDATION.get(x, x)


total_df = pd.read_csv(
    "../data/processed_data/total_tweet_dataset.csv",
    header=0,
    dtype=DTYPE,
    converters=CONVERTERS,
    parse_dates=PARSE_DATES,
)

# Index by tweet id, order chronologically, then drop repeated ids (keeping the
# earliest row). The duplicate mask has to be computed AFTER `set_index`:
# evaluating `total_df.index.duplicated()` inside the same expression looks at
# the index of the frame as it was read, not at the `id` values.
total_df = total_df.set_index("id").sort_values("created_at")
total_df = total_df[~total_df.index.duplicated(keep="first")]

# Keep only tweets that still have text after cleaning.
total_df = total_df[total_df.clean_text.notnull()]

total_df = total_df.assign(
    # Truncate timestamps to the day so tweets can be grouped per date.
    created_at=total_df["created_at"].dt.to_period("D").dt.to_timestamp('s'),
    # Collapse province names through PROV_CONSOLIDATION (one pass is enough).
    province=total_df["province"].apply(prov_map),
)
total_df = total_df[total_df["province"].isin(CONSOLIDATED_PROVINCES)]
print(len(total_df))
total_df.head()

  from pandas import Panel


401898


Unnamed: 0_level_0,created_at,screen_name,source,clean_text,original_text,is_retweet,favorite_count,retweet_count,city,province,longitude,latitude,cluster,cluster_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1228700383482126337,2020-02-15,Dale32497473,Twitter for Android,tyrant behaves people afraid drop kid school s...,Only a tyrant behaves this way. When people ar...,True,0,0,Quebec,Quebec,-71.21454,46.81228,7,Overflow
1228701333127036934,2020-02-15,DanWKearney,Twitter for iPhone,beachglassfan shawnwilson1975 bluejay19758259 ...,@BeachGlassFan @ShawnWilson1975 @BlueJay197582...,False,1,1,Halifax,Atlantic Province,-63.57239,44.64533,6,School Closures
1228703017807032321,2020-02-15,DanWKearney,Twitter for iPhone,shawnwilson1975 bluejay19758259 itsfortodat hr...,@ShawnWilson1975 @BlueJay19758259 @itsfortodat...,False,0,0,Halifax,Atlantic Province,-63.57239,44.64533,6,School Closures
1228710813596356608,2020-02-15,TwistedEgo2026,Twitter for iPhone,work hard give child best child worry die hand...,WP: I work hard to give my children the best I...,True,0,0,Windsor,Ontario,-83.01654,42.30008,3,Remote Work
1228713336788811776,2020-02-15,NorthShoreNews,dlvr.it,school district hope open childcare centre tea...,#WestVan school district hopes to open childca...,False,41,11,Vancouver,British Columbia,-123.11934,49.24966,2,Childcare


In [23]:
# Kaleido scope: later cells call `scope.transform(fig, format="pdf")` and write
# the returned bytes to disk to export figures as PDF.
from kaleido.scopes.plotly import PlotlyScope
scope = PlotlyScope()
# Layout settings shared by the figures in this notebook; unpacked into
# `fig.update_layout(**vis_args)`.
vis_args = dict(
    template="simple_white",
    font=dict(size=23),
    width=1000,
)

## How many and where?

What does the conversation look like?

In [None]:
from utils import PROVINCE_COLOR_MAP
import plotly.express as px

# First and last day covered by the dataset, as YYYY-MM-DD strings.
# A later cell reuses them when building output file names.
min_d, max_d = str(total_df["created_at"].min()).split()[0], str(total_df["created_at"].max()).split()[0]
print(min_d, max_d)

# `counts` used to be referenced without being assigned (its definition was
# commented out) and `PROVINCES` is not imported anywhere in this notebook, so
# the cell failed on a fresh kernel. The province column has already been
# consolidated, so CONSOLIDATED_PROVINCES is the matching list to filter on.
counts = total_df[total_df["province"].isin(CONSOLIDATED_PROVINCES)]

# Tweets per (province, day), pivoted to one column per province.
counts = (
    counts.groupby(["province", "created_at"])["screen_name"]
    .count()
    .reset_index()
    .rename({"screen_name": "count"}, axis=1)
    .pivot(index="created_at", columns="province", values="count")
    .fillna(0)  # days on which a province has no tweets
)
counts["Total"] = counts.sum(axis=1)

# Back to long format (created_at, province, value) with "Total" as an extra series.
counts = counts[["Total"] + CONSOLIDATED_PROVINCES].reset_index()
counts = pd.melt(
    counts,
    id_vars=["created_at"],
    value_vars=["Total"] + CONSOLIDATED_PROVINCES,
    var_name="province",
).sort_values(["created_at", "province"])

args = {
    "x": "created_at",
    "y": "value",
    "color": "province",
    "line_shape": "spline",
    "template": "simple_white",
    "labels": {"value": "# Tweets", "created_at": "Date", "province": "Province"},
    "color_discrete_map": PROVINCE_COLOR_MAP
}

fig1 = px.line(counts, **args)
fig1.update_layout(**vis_args)
fig1.show()

In [None]:
# `PROVINCES` is not imported anywhere in this notebook; the province column
# has already been consolidated, so filter on CONSOLIDATED_PROVINCES instead.
n_unique = total_df[total_df["province"].isin(CONSOLIDATED_PROVINCES)]

# Distinct screen names per (province, day), pivoted to one column per province.
n_unique = (
    n_unique.groupby(["province", "created_at"])["screen_name"]
    .nunique()
    .reset_index()
    .rename({"screen_name": "n_unique"}, axis=1)
    .pivot(index="created_at", columns="province", values="n_unique")
    .fillna(0)  # days on which a province has no tweets
)
# NOTE: "Total" is the sum of the per-province distinct counts, so an account
# that appears under two provinces on the same day is counted twice.
n_unique["Total"] = n_unique.sum(axis=1)

# Back to long format (created_at, province, value) with "Total" as an extra series.
n_unique = n_unique[["Total"] + CONSOLIDATED_PROVINCES].reset_index()
n_unique = pd.melt(
    n_unique,
    id_vars=["created_at"],
    value_vars=["Total"] + CONSOLIDATED_PROVINCES,
    var_name="province",
).sort_values(["created_at", "province"])

args = {
    "x": "created_at",
    "y": "value",
    "color": "province",
    "line_shape": "vh",
    "title":  "Unique Twitter Users Over Time",
    "template": "simple_white",
    "labels": {"value": "# Unique Twitter Users", "created_at": "Date", "province": "Province"},
    "color_discrete_map": PROVINCE_COLOR_MAP
}

fig2 = px.line(n_unique, **args)
fig2.update_layout(**vis_args)
fig2.show()

## Population Breakdowns

`source:` https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1710000901

Q: Are tweets about covid proportional to provincial populations?

In [25]:
import plotly.graph_objects as go

pop_dist = pd.read_csv("../data/external_datasets/canada_population_data.csv")

# Share of tweets per province. The previous selection
# `total_df[["province"] & mask]` combined a list with a boolean Series and is
# not a valid row filter; `PROVINCES` was also never defined in this notebook.
# Reindexing by CONSOLIDATED_PROVINCES guarantees that the y values line up
# with the x axis instead of relying on an alphabetical sort.
tweet_counts = (
    total_df.loc[total_df["province"].isin(CONSOLIDATED_PROVINCES), "province"]
    .value_counts()
    .reindex(CONSOLIDATED_PROVINCES, fill_value=0)
)
tweet_counts = tweet_counts / tweet_counts.sum()

# Share of population per province, consolidated the same way as the tweets.
# NOTE(review): assumes the `name` column uses the province names that
# PROV_CONSOLIDATION / CONSOLIDATED_PROVINCES know about - a province whose
# name does not match shows up as a missing bar; confirm against the CSV.
pop_counts = (
    pop_dist.assign(name=pop_dist["name"].apply(prov_map))
    .groupby("name")["pop2019"]
    .sum()
    .reindex(CONSOLIDATED_PROVINCES)
)
pop_counts = pop_counts / pop_counts.sum()

fig = go.Figure(data=[
    go.Bar(name='# Tweets', x=CONSOLIDATED_PROVINCES, y=tweet_counts),
    go.Bar(name='Population', x=CONSOLIDATED_PROVINCES, y=pop_counts)
])

fig.update_layout(barmode='group', height=1000, **vis_args)
fig.update_yaxes(tickformat=',.0%')
# fig.write_html("../visualizations/exploratory_analysis/pop_tweet_perc_bar_chart.html")
with open("../visualizations/exploratory_analysis/pop_tweet_perc_bar_chart.pdf", "wb") as f:
    f.write(scope.transform(fig, format="pdf"))
fig.show()


In [24]:
from utils import PROV_CONSOLIDATION, CONSOLIDATED_PROVINCES

def prov_map(x):
    """Look `x` up in PROV_CONSOLIDATION; values without an entry are returned as-is."""
    if x in PROV_CONSOLIDATION:
        return PROV_CONSOLIDATION[x]
    return x

# Re-apply the consolidation to the province column.
total_df["province"] = total_df["province"].apply(prov_map)

In [31]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def _weekly_activity(tweets, how):
    """Aggregate `screen_name` per province and week into a long-format frame.

    Parameters
    ----------
    tweets : DataFrame
        Must provide `province`, `created_at` and `screen_name` columns.
    how : str
        Aggregation applied to `screen_name` within each group
        ("count" for tweets, "nunique" for distinct accounts).

    Returns
    -------
    DataFrame
        Columns `created_at`, `province`, `value`, sorted by date then
        province, with an extra "Total" series that sums the provinces.
    """
    series_order = ["Total"] + CONSOLIDATED_PROVINCES
    weekly = (
        tweets[tweets["province"].isin(CONSOLIDATED_PROVINCES)]
        # Weekly bins via freq="W-MON".
        .groupby(["province", pd.Grouper(key="created_at", freq="W-MON")])["screen_name"]
        .agg(how)
        .reset_index()
        .pivot(index="created_at", columns="province", values="screen_name")
        .fillna(0)  # weeks in which a province has no tweets
    )
    weekly["Total"] = weekly.sum(axis=1)
    weekly = weekly[series_order].reset_index()
    return pd.melt(
        weekly,
        id_vars=["created_at"],
        value_vars=series_order,
        var_name="province",
    ).sort_values(["created_at", "province"])


# Both frames come from the same pipeline; only the aggregation differs.
# `counts` is reused further down when joining against the case data.
counts = _weekly_activity(total_df, "count")
n_unique = _weekly_activity(total_df, "nunique")

fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Tweets Over Time",
                    "Unique Twitter Users Over Time"))

for prov in ["Total"] + CONSOLIDATED_PROVINCES:
    # Row 1: tweets, row 2: unique users. Both traces share a legend group so
    # one legend entry toggles the province in both panels.
    for row, frame in enumerate((counts, n_unique), start=1):
        series = frame[frame["province"] == prov]
        fig.add_trace(go.Scatter(x=series["created_at"],
                                 y=series["value"],
                                 marker=dict(color=PROVINCE_COLOR_MAP[prov]),
                                 mode="lines",
                                 line_shape="spline",
                                 line=dict(width=4),
                                 name=prov,
                                 legendgroup=prov,
                                 showlegend=(row == 1)),
                      row=row,
                      col=1)

fig.update_xaxes(title_text="Date", row=2, col=1)

fig.update_yaxes(title_text="Tweets", row=1, col=1)
fig.update_yaxes(title_text="Unique Twitter Users", row=2, col=1)

extra_args = {"legend": {"orientation": "h",
                         "yanchor": "bottom",
                         "y": 1.07,
                         "xanchor": "right",
                         "x": 1},
              "height": 700,
              }
fig.update_layout(**vis_args, **extra_args)
# Log scale on both panels; weeks filled with 0 above have no position on a log axis.
fig.update_yaxes(type="log")

# `min_d` / `max_d` are the dataset date bounds computed in the "How many and where?" cell.
fp = f"../visualizations/exploratory_analysis/{min_d}_{max_d}-activity_over_time"
fig.write_html(f"{fp}.html")
with open(f"{fp}.pdf", "wb") as f:
    f.write(scope.transform(fig, format="pdf"))
fig.show()

## Tweets as a function of COVID Cases

Source: https://open.canada.ca/data/en/dataset/b8d1d622-1ceb-4c1c-96e9-a0b38939080b

In [33]:
from datetime import datetime
# Load the case data and align its province column with the tweet data.
case_data = pd.read_csv("../data/external_datasets/canada_covid19.csv")
case_data = case_data.rename({"prname": "province"}, axis=1)
case_data["province"] = case_data["province"].apply(prov_map)

# Dates arrive as day-month-year strings.
case_data["date"] = case_data["date"].map(lambda raw: datetime.strptime(raw, "%d-%m-%Y"))

# Keep the consolidated provinces and total `numtotal_last14` per date and province.
in_scope = case_data["province"].isin(CONSOLIDATED_PROVINCES)
case_data = (
    case_data[in_scope]
    .groupby(["date", "province"])["numtotal_last14"]
    .sum()
    .fillna(0)
    .reset_index()
    .sort_values(["date", "province"])
)

line_kwargs = dict(
    x="date",
    y="numtotal_last14",
    color="province",
    line_shape="vh",
    color_discrete_map=PROVINCE_COLOR_MAP,
)
fig = px.line(case_data, **line_kwargs)
fig.update_layout(
    title_text="COVID-19 Cases Over Time",
    height=600,
    template="simple_white",
)


In [41]:
# Pair each case-data row with the tweet count for the same date and province.
scatter = case_data.merge(
    counts,
    how="inner",
    left_on=["date", "province"],
    right_on=["created_at", "province"],
).sort_values(["date", "province"])

fig = px.scatter(
    scatter,
    x="numtotal_last14",
    y="value",
    color="province",
    labels={
        "numtotal_last14": "COVID-19 Cases (14 Day Window)",
        "value": "# Tweets",
        "province": "Province",
    },
    color_discrete_map=PROVINCE_COLOR_MAP,
)

# Horizontal legend placed above the plot area, plus the shared layout settings.
legend_layout = dict(orientation="h", yanchor="bottom", y=1.07, xanchor="right", x=1)
fig.update_layout(height=520, legend=legend_layout, **vis_args)

# Export as interactive HTML and as a static PDF.
fp = "../visualizations/exploratory_analysis/scatter-actvitiy_covid_cases"
fig.write_html(fp + ".html")
with open(fp + ".pdf", "wb") as f:
    f.write(scope.transform(fig, format="pdf"))
fig.show()


In [43]:
from sklearn.linear_model import LinearRegression

def r2(x,y):
    """Score a single-feature linear regression of `y` on `x`.

    Both arguments must expose `to_numpy` (pandas Series in this notebook).
    `x` is reshaped into one column, a `LinearRegression` is fitted, and the
    value of its `score` method on the same data is returned.
    """
    features = x.to_numpy(dtype=float).reshape(-1, 1)
    target = y.to_numpy(dtype=float)
    model = LinearRegression()
    model.fit(features, target)
    return model.score(features, target)

# Report r^2 for the pooled data first, then for each province separately.
subsets = [("Overall", scatter)]
subsets += [(prov, scatter[scatter["province"] == prov]) for prov in CONSOLIDATED_PROVINCES]

for label, subset in subsets:
    print(label)
    score = r2(subset["numtotal_last14"], subset["value"])
    print(f"\t* r^2={score:.4f}")

Overall
	* r^2=0.0016
Alberta
	* r^2=0.0726
British Columbia
	* r^2=0.3149
Manitoba
	* r^2=0.3782
Ontario
	* r^2=0.1890
Quebec
	* r^2=0.0913
Saskatchewan
	* r^2=0.1412
Atlantic Province
	* r^2=0.1808


## Geographic Heatmap

Make use of that longitude and latitude

In [44]:
import matplotlib.pyplot as plt
import numpy as np
import json
import folium
from folium import plugins
%matplotlib inline

# Tweets per distinct (longitude, latitude) pair, counted via the non-null
# `city` values. `PROVINCES` is not imported anywhere in this notebook; the
# province column has already been consolidated, so filter on
# CONSOLIDATED_PROVINCES instead.
loc_data = (
    total_df[total_df["province"].isin(CONSOLIDATED_PROVINCES)]
    .groupby(["longitude", "latitude"])[["city"]]
    .count()
    .rename({"city": "count"}, axis=1)
)

# Marker styling by tweet volume. Bucket i covers
# bucket_edges[i-1] <= count < bucket_edges[i]; bucket 0 is everything below
# 50 and the last bucket is 400 and above.
bucket_edges = [50, 75, 100, 150, 200, 300, 400]
bucket_colors = np.array(["grey", "brown", "lightblue", "red", "darkblue", "Orange", "green", "Black"])
bucket_sizes = np.array([2, 3, 4, 5, 6, 8, 10, 12])

bucket = np.digitize(loc_data["count"].to_numpy(), bucket_edges)
loc_data["color"] = bucket_colors[bucket]
loc_data["size"] = bucket_sizes[bucket]

loc_data = loc_data.reset_index()
loc_data

Unnamed: 0,longitude,latitude,count,color,size
0,-130.320120,54.316140,38,grey,2
1,-128.950071,54.993627,2,grey,2
2,-128.664774,54.425372,1,grey,2
3,-128.653420,54.052440,7,grey,2
4,-128.603450,54.516340,11,grey,2
...,...,...,...,...,...
769,-71.214540,46.812280,5800,Black,12
770,-71.177930,46.803260,1,grey,2
771,-70.716670,46.666670,15,grey,2
772,-69.542430,47.826990,5,grey,2


In [45]:
# Map with initial view [60, -85] at zoom level 4.
m = folium.Map([60, -85], zoom_start=4)

# One circle per location. Radius and colour both come from the volume bucket
# computed in `loc_data`; previously the per-point colour was unpacked but
# ignored, and the outline used 'b', which is not a CSS colour name.
for lon, lat, color, size in zip(loc_data['longitude'], loc_data['latitude'],
                                 loc_data['color'], loc_data['size']):
    folium.CircleMarker(
        [lat, lon],  # folium expects latitude first
        radius=size,
        color=color,
        fill=True,
        fill_opacity=0.7,
        fill_color=color,
    ).add_to(m)

m.save("../visualizations/exploratory_analysis/tweet_heatmap.html")
m