In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

In [11]:
# Load the first trends file, parse the date column, and drop every row that
# has a missing value in any column.
df = (
    pd.read_csv("Trends/db_0.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .dropna()
)

In [12]:
# Narrow the frame to the country label and the "Facebook" series only
df = df.loc[:, ["country", "Facebook"]]

In [13]:
# Distinct country names, in order of first appearance in the data
countries = pd.unique(df["country"])

In [14]:
# Build a wide DataFrame with one column per country.
# Each country's "Facebook" values are re-indexed 0..n-1, so columns are lined
# up by row position (not by date); shorter columns are padded with NaN.
# The Series are collected first and concatenated once, instead of growing
# the frame inside the loop (which re-copies every column on each iteration).
country_series = [
    df.loc[df["country"] == country, "Facebook"]
    .reset_index(drop=True)
    .rename(country)
    for country in countries
]
# pd.concat raises on an empty list, so fall back to an empty frame
new_df = pd.concat(country_series, axis=1) if country_series else pd.DataFrame()

In [15]:
# Add a "Global" column (row-wise sum of the country columns divided by the
# number of country columns) and place it first.
# Any "Global" column left over from a previous run of this cell is dropped
# first, so re-running the cell does not fold the old average into the new one.
new_df = new_df.drop(columns=["Global"], errors="ignore")
GLOB = new_df.sum(axis=1) / new_df.shape[1]
new_df.insert(0, "Global", GLOB)

In [16]:
# Set to True to draw an annotated heatmap of the correlation matrix
VIZUALIZE = False

if VIZUALIZE:
    fig, ax = plt.subplots(figsize=(60, 40), dpi=300)
    ax.set_title("Facebook")
    sns.heatmap(new_df.corr(), annot=True, ax=ax)

In [17]:
# Pairwise Pearson correlation between all columns of new_df
CORRELATION_MATRIX = new_df.corr(method="pearson")

In [18]:
# CORRELATION_MATRIX can now be used for whatever you want;
# here its column labels and raw values are printed, one after the other.
print(CORRELATION_MATRIX.columns, CORRELATION_MATRIX.values, sep="\n")


Index(['Global', 'Afghanistan', 'China', 'United Kingdom', 'Colombia',
       'Cameroon', 'Croatia', 'Sweden', 'Morocco', 'Bangladesh', 'Italy',
       'Ecuador', 'Egypt', 'Turkey', 'Ukraine', 'United States', 'Finland',
       'Israel', 'Denmark', 'Brazil', 'Chile', 'Iran', 'Georgia', 'Germany',
       'Bolivia', 'Greece', 'Canada', 'Saudi Arabia', 'Uruguay', 'Gambia',
       'Thailand', 'Hong Kong', 'Hungary', 'Iraq', 'Iceland', 'Switzerland',
       'India', 'Belgium', 'France', 'Estonia', 'Lebanon', 'Cuba', 'Russia',
       'Bosnia & Herzegovina', 'Belarus', 'Montenegro', 'Liechtenstein',
       'South Korea', 'Japan', 'Spain', 'Libya', 'Tunisia', 'Ireland',
       'Lithuania', 'Luxembourg', 'Austria', 'Venezuela', 'South Africa',
       'Mexico', 'Czechia', 'Latvia', 'Kenya', 'Mongolia', 'Norway',
       'Paraguay', 'Netherlands', 'New Zealand', 'Nigeria', 'Slovakia',
       'Slovenia', 'Pakistan', 'Argentina', 'Peru', 'Qatar', 'Serbia',
       'Romania', 'Australia', 'Singapore',

In [25]:
def get_corr_matrix(filename,category):
    '''
    Build the correlation matrix between countries for one category column.

    filename: name of a CSV file inside Trends/ (remember .csv). ex: "db_0.csv"
    category: name of the column to correlate. ex "Facebook", "YouTube"
    returns (columns, values):
        columns: pandas Index with "Global" first, followed by the countries
                 in order of first appearance in the file
        values:  2D numpy array holding the correlation matrix; rows and
                 columns follow the order of `columns`

    Raises KeyError if the file has no "country" or `category` column.
    '''
    # Rows with a missing value in ANY column of the file are dropped,
    # not only rows where `category` itself is missing.
    df = pd.read_csv(f"Trends/{filename}").dropna()

    # Only the country label and the requested category are needed from here on
    df = df[["country", category]]

    # One Series per country, in order of first appearance. Each is re-indexed
    # 0..n-1 so the columns line up by row position.
    # NOTE(review): alignment is positional, not by date. If dropna removed
    # different rows for different countries, the series are shifted against
    # each other - confirm this is acceptable for the data.
    per_country = [
        values.reset_index(drop=True).rename(country)
        for country, values in df.groupby("country", sort=False)[category]
    ]

    # Concatenate once instead of growing a DataFrame inside the loop;
    # pd.concat raises on an empty list, so fall back to an empty frame.
    new_df = pd.concat(per_country, axis=1) if per_country else pd.DataFrame()

    # "Global" = row-wise sum over all country columns divided by the number
    # of country columns; it is placed in front of the country columns.
    global_series = new_df.sum(axis=1) / new_df.shape[1]
    new_df = new_df.drop(columns=["Global"], errors="ignore")
    new_df.insert(0, "Global", global_series)

    CORRELATION_MATRIX = new_df.corr()

    return CORRELATION_MATRIX.columns, CORRELATION_MATRIX.values
    
# Example call: column labels (cnt) and correlation values (mtr) for the
# "Facebook" column of Trends/db_0.csv
cnt,mtr = get_corr_matrix("db_0.csv","Facebook")

Index(['Global', 'Afghanistan', 'China', 'United Kingdom', 'Colombia',
       'Cameroon', 'Croatia', 'Sweden', 'Morocco', 'Bangladesh', 'Italy',
       'Ecuador', 'Egypt', 'Turkey', 'Ukraine', 'United States', 'Finland',
       'Israel', 'Denmark', 'Brazil', 'Chile', 'Iran', 'Georgia', 'Germany',
       'Bolivia', 'Greece', 'Canada', 'Saudi Arabia', 'Uruguay', 'Gambia',
       'Thailand', 'Hong Kong', 'Hungary', 'Iraq', 'Iceland', 'Switzerland',
       'India', 'Belgium', 'France', 'Estonia', 'Lebanon', 'Cuba', 'Russia',
       'Bosnia & Herzegovina', 'Belarus', 'Montenegro', 'Liechtenstein',
       'South Korea', 'Japan', 'Spain', 'Libya', 'Tunisia', 'Ireland',
       'Lithuania', 'Luxembourg', 'Austria', 'Venezuela', 'South Africa',
       'Mexico', 'Czechia', 'Latvia', 'Kenya', 'Mongolia', 'Norway',
       'Paraguay', 'Netherlands', 'New Zealand', 'Nigeria', 'Slovakia',
       'Slovenia', 'Pakistan', 'Argentina', 'Peru', 'Qatar', 'Serbia',
       'Romania', 'Australia', 'Singapore',

In [40]:
# get categories for each file
def get_categories(filenames):
    '''
    Read the category column names of each trends file.

    filenames: list of filenames inside Trends/ (remember .csv)
    returns a list with one entry per filename, in the same order; each
    entry is a pandas Index holding every column name after the first two
    (presumably the date and country columns - verify against the files)
    '''
    categories = []
    for filename in filenames:
        # nrows=0 parses only the header line: the column names are all that
        # is needed, so the data rows are never loaded.
        header = pd.read_csv(f"Trends/{filename}", nrows=0)
        categories.append(header.columns[2:])
    return categories

In [45]:
# Prepare the inputs for the correlation matrices:
# the ten file names db_0.csv ... db_9.csv and the categories found in each
filenames = [f"db_{i}.csv" for i in range(10)]

categories = get_categories(filenames)
print(filenames,categories)


['db_0.csv', 'db_1.csv', 'db_2.csv', 'db_3.csv', 'db_4.csv', 'db_5.csv', 'db_6.csv', 'db_7.csv', 'db_8.csv', 'db_9.csv'] [Index(['Facebook', 'YouTube', 'Instagram', 'Twitter', 'Snapchat'], dtype='object'), Index(['Elon Musk', 'Jeff Bezos', 'Bill Gates', 'Warren Buffett',
       'Mark Zuckerberg'],
      dtype='object'), Index(['Osama Bin Laden', 'Donald Trump', 'Brexit', 'ISIS', 'al-Qaida'], dtype='object'), Index(['Big Data', 'Artificial Intelligence', 'Augmented Reality',
       'Internet of Things', 'Cloud Computing'],
      dtype='object'), Index(['Depression', 'Therapy', 'Bullying', 'Stress', 'Mental Health'], dtype='object'), Index(['Ebola', 'Cholera', 'Zika virus', 'Measles', 'Malaria'], dtype='object'), Index(['Climate Change', 'Global Warming', 'Pollution', 'Fossil Fuels',
       'Renewable Energy'],
      dtype='object'), Index(['Trauma', 'Racism', 'Loneliness', 'Violence', 'Anxiety'], dtype='object'), Index(['Job', 'Education', 'Freedom', 'Income', 'Investing'], dtype='objec

In [None]:
# Correlation matrices are stored as a list of 3-element lists:
# [category, countries, matrix]
# Takes less than 2 min to run
corr_matrices = []
for filename, file_categories in zip(filenames, categories):
    for cat in file_categories:
        # A distinct name is used here so the notebook-level `countries`
        # variable defined in an earlier cell is not overwritten.
        cat_countries, corr_matrix = get_corr_matrix(filename, cat)
        corr_matrices.append([cat, cat_countries, corr_matrix])

# Show a compact summary (category and matrix shape) instead of dumping
# every full matrix into the cell output.
[(cat, matrix.shape) for cat, _, matrix in corr_matrices]