# Pure data processing:

- Merge 4 datasets
- Create columns for every keyword
- Take the "unemployment Google searches" keyword, remove the first 2 rows of unemployment, slide the rest of the data to fill that holes  


In [1]:
import pandas as pd

In [2]:
df_political=pd.read_csv("./input/dashboard_spanish_news_political.csv.gz",compression='gzip', header=0, quotechar='"', error_bad_lines=False)
df_political.sort_values(by=["Date"],inplace=True)

df_economical=pd.read_csv("./input/dashboard_spanish_news_economical.csv.gz",compression='gzip', header=0, quotechar='"', error_bad_lines=False)
df_economical.sort_values(by=["Date"],inplace=True)

df_social=pd.read_csv("./input/dashboard_spanish_news_social.csv.gz",compression='gzip', header=0, quotechar='"', error_bad_lines=False)
df_social.sort_values(by=["Date"],inplace=True)

df_google=pd.read_csv("./input/data_pytrends.csv")
df_google.sort_values(by=["date"],inplace=True)

## Dataset structures:

- Requested data from BigQuery (df_political/social/economical): Every different keyword in **political** will be a column with its associated sentiment
- The same for google searches: Every different keyword will be a column with its associated trend_index.
- All df will be resampled and grouped by date to have a weekly display with the corresponding average of **sentiment** or **trend_index**

In [3]:
df_political.head()

Unnamed: 0,political,Date,Sentiment
15773,juicio,2019-01-01,0.18
52050,seguridad_nacional,2019-01-01,-6.55
52051,seguridad_nacional,2019-01-01,-1.43
52052,seguridad_nacional,2019-01-01,-1.43
53246,inestabilidad_politica,2019-01-01,-0.3


In [4]:
# google searches
df_google.columns

Index(['Unnamed: 0', 'keyword', 'date', 'trend_index'], dtype='object')

In [5]:
df_google.drop(columns='Unnamed: 0', inplace=True)
#df_google.rename(columns={'date':"Date"}, inplace=True)
df_google.head()

Unnamed: 0,keyword,date,trend_index
0,zoom,2019-01-06,5
1410,bildu,2019-01-06,3
6204,uber eats,2019-01-06,10
4982,medico,2019-01-06,89
2632,productividad,2019-01-06,39


## Google Dataset: 

- Creating a column for each keyword with the trend_index value

In [6]:
# new dataframe
df_google_dates=pd.DataFrame()

# creating the Date column in new dataset
df_google_dates["date"]=list(set(df_google["date"]))
df_google_dates.sort_values(by=["date"],inplace=True)
df_google_dates.head()

Unnamed: 0,date
81,2019-01-06
85,2019-01-13
84,2019-01-20
40,2019-01-27
20,2019-02-03


In [7]:
# Creating the new columns. Trend index with the name of the corresponding keyword
keyword_list=list(set(df_google["keyword"]))
keyword_list.sort()
for k in keyword_list:
    df_google_dates[k]=df_google[(df_google['keyword'] == k)]["trend_index"].tolist()

In [8]:
df_google_dates.head()

Unnamed: 0,date,amazon,autonomo,ayuda alquiler,badi,banco alimentos,barometro,bildu,bullying,cabify,...,taxi,teletrabajo,tinder,uber,uber eats,videoconferencia,videollamada,vox,yoga,zoom
81,2019-01-06,59,47,38,38,5,32,3,24,20,...,42,2,56,27,10,2,2,35,50,5
85,2019-01-13,50,44,32,51,12,28,4,27,30,...,49,1,51,35,15,3,1,27,44,4
84,2019-01-20,44,41,20,38,13,31,2,25,100,...,100,2,50,100,9,3,2,16,41,3
40,2019-01-27,46,51,19,45,6,44,4,30,68,...,85,2,52,78,11,3,3,11,49,4
20,2019-02-03,48,47,21,32,7,58,5,24,45,...,67,1,50,40,11,3,2,13,48,4


In [9]:
# lets check it out if it's right
print(list(df_google_dates["zoom"][:10]),
      "<==>",
      df_google[df_google["keyword"]=="zoom"]["trend_index"].tolist()[:10],
      ", allright then"
     )

[5, 4, 3, 4, 4, 4, 4, 4, 3, 3] <==> [5, 4, 3, 4, 4, 4, 4, 4, 3, 3] , allright then


## Sliding "unemployment" column.

- Now, I have to remove the first 2 rows of the keyword "desempleo", and supress that space with the rest of the column, so the last 2 rows will be empty 

In [10]:
# I should perform feature ingineering before doing this, to check what's really going on

#desempleo_list=list(df_google_dates["desempleo"])

# delete first 0 positions and add empty ones at the end (not the most elegant)
#desempleo_list.pop(0)
#desempleo_list.pop(1)
#desempleo_list.append(0)
#desempleo_list.append(0)

# add to the dataset
#df_google_dates["desempleo"]=desempleo_list

# ok, it works
#df_google_dates[["Date","desempleo"]]

# Manipulating datasets with Spanish news and sentiment.

We'll need to:

- Create a column for each keyword
- Count occurrences of that keyword
- Measure average sentiment
- Group data by week, starting on monday, to merge with the Google dataset
- Combine occurrences and sentiment into one column representative of both, for each keyword

In [11]:
# let's pplay with the 1st dataset and a random keyword, for instance
df_political[df_political["political"]=="juicio"].head()

Unnamed: 0,political,Date,Sentiment
15773,juicio,2019-01-01,0.18
15774,juicio,2019-01-01,0.18
15784,juicio,2019-01-01,-6.08
15777,juicio,2019-01-01,-4.06
15778,juicio,2019-01-01,-4.06


In [12]:
#df_political.index=df_political["Date"]

- So, I need to measure the average of sentiment of each keyword per day

In [13]:
# df_political[df_political["political"]=="juicio"].groupby("Date").mean()

- Also, counting occurrences of that keyword

In [14]:
# df_political[df_political["political"]=="juicio"].groupby("Date").count()

- Let's use an aggregate to perform both

In [15]:
#df2=df_political[df_political["political"]=="juicio"].groupby(["Date"]).agg(['count','mean'])
# erase multiindex
#df2.columns=df2.columns.droplevel(0)
#df2.head()

- Great, now let's resample by week, starting on Sunday, like the Google Searches dataset

In [16]:
#df2.index = pd.to_datetime(df2.index)

In [17]:
#df2 = df2.resample('W-SUN').mean() #weekly totals

In [18]:
# score is how we are going to measure the keywords
#df2["score"]=df2["count"]*df2["mean"]

In [19]:
#df2.head()

ok, now I know how to do it, then let's continue creating a function to perform this for every keyword in every Spanish news dataset

- I need to create an empty dataframe, 
- loop por ,each keyword from a set of keywords
- perform what i did before for all keywords
- concat to the mepty dataframe
- put all this in a function

In [20]:
def creating_dataset(df,column):
    '''
    Column is the column in which are allocated the keywords, for every case: political, social and economical columns
    '''
    
    # list of new columns
    list_keywords=list(set(df[column]))
    # creating empty dataframe to append info
    df_final=pd.DataFrame()
    df_final["date"]=list(set(df["Date"]))
    

    for k in list_keywords:
        # creating a new dataframe for every keyword in the column, getting the occurrences of keyword and mean of sentiment
        df4=pd.DataFrame()
        df4=df[df[column]==k].groupby(["Date"]).agg(['count','mean'])
        # erase multiindex
        df4.columns=df4.columns.droplevel(0)
        # this will be our score, occurrences * mean 
        df4[k]=df4["count"]*df4["mean"]
        # date column to perform the join by it
        df4["date"]=df4.index
        df4.drop(columns=["count","mean"],inplace=True)
        # this is where we combine the empty dataset, every keyword in its place
        df_final=df_final.merge(df4,how='left', left_on='date', right_on='date')

    # make datetime index for weekly resampling
    df_final["date"]=pd.to_datetime(df_final['date']) 
    df_final.index=df_final["date"]
    # resampling
    df_final = df_final.resample('W-SUN').mean() #weekly totals
    df_final.sort_values(by="date", ascending=True, inplace=True)
    # filling gaps
    df_final=df_final.fillna(0)
    
    return df_final

In [21]:
dfp = creating_dataset(df_political,"political")
dfs = creating_dataset(df_social,"social")
dfe = creating_dataset(df_economical,"economical")

In [22]:
dfp.head()

Unnamed: 0_level_0,seguridad_nacional,juicio,inestabilidad_politica,corrupcion,precio_petroleo,extremismo,refugiados,protestas,rebelion,terrorismo,ejercito,vigilancia
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-06,-30.555,-111.88,-7.77,-92.164,0.0,-29.993333,-6.425,-0.986667,-21.453333,-82.14,-20.473333,-31.525
2019-01-13,-5.313333,-485.394286,-23.5,-168.828571,0.0,-2.26,-15.266667,-16.754286,-17.413333,-11.586667,-31.36,-67.15
2019-01-20,-3.22,-232.251429,-36.12,-106.468571,3.72,-5.644,0.0,-98.83,-26.305,-145.26,-23.7,-49.556
2019-01-27,-16.08,-233.122857,-24.146667,-106.342857,0.0,-4.226667,1.276,-41.448571,-26.57,-34.084,-25.508571,-14.53
2019-02-03,-1.48,-173.451429,-5.16,-112.497143,0.0,0.0,-20.31,-35.362857,-28.106667,-24.512,-6.876667,-21.495


In [23]:
dfs.head()

Unnamed: 0_level_0,ciencia,censura_en_medios,subsidios,emergencia_sanitaria,vacunas,enfermedades_muy_infecciosas,emprendimiento,racismo,inmigracion,agresion_sexual,precio_vivienda,energias_renovables
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-06,-6.68,0.0,0.0,-88.38,-23.89,-195.31,1.68,0.0,-1.45,-52.646667,0.0,0.0
2019-01-13,-36.077143,0.0,0.0,-44.537143,-11.04,-69.154286,5.3,0.0,0.896667,-109.376,-2.64,0.0
2019-01-20,-7.24,0.0,0.0,-56.36,-5.89,-109.811429,0.0,0.0,3.912,-8.7,0.0,0.0
2019-01-27,-26.273333,0.0,0.0,-28.24,0.0,-101.22,-3.932,0.0,-6.73,-17.14,0.0,0.0
2019-02-03,-8.63,1.74,0.0,-56.934286,0.0,-120.577143,5.64,0.0,-79.54,-164.08,0.34,-2.78


In [24]:
dfe.head()

Unnamed: 0_level_0,libre_comercio,inflacion_economica,desempleo,banco_mundial,job_quality_&_labor_market_performance,stock_market,incertidumbre_economica,crecimiento_economico,prosperidad_economica_y_finanzas,precio_petroleo,pobreza,finanzas_y_bancos,quiebra_economica,macroeconomia_deuda_y_vulnerabilidad
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-06,0.0,0.0,-41.995,0.0,0.0,-19.043333,-631.063333,0.0,0.0,0.0,-41.553333,0.0,0.0,-37.705
2019-01-13,-17.82,0.0,-5.933333,0.0,-9.82,-14.948571,-750.242857,0.0,0.0,0.0,-66.84,0.0,0.0,-7.46
2019-01-20,0.0,7.44,-35.166667,0.0,-2.368,-7.531429,-791.151429,0.0,0.0,3.72,-23.345714,0.0,-11.61,-7.76
2019-01-27,0.0,0.0,-14.246667,0.0,-10.792,-16.442857,-770.017143,1.99,0.0,0.0,-12.205714,0.0,0.78,-21.0
2019-02-03,0.0,0.0,-11.875,-0.5,-3.333333,-12.111429,-696.837143,0.0,0.0,0.0,-41.56,0.0,-8.773333,-29.86


# Merging all datasets

In [29]:
from datetime import datetime, date

In [32]:
# creating final dataset with everything
date1 = '2019-01-06'
date2 = datetime.now().date()
mydates = pd.date_range(date1, date2, freq="W").tolist()
df_final=pd.DataFrame()
df_final["date"]=mydates
df_final.head()

Unnamed: 0,date
0,2019-01-06
1,2019-01-13
2,2019-01-20
3,2019-01-27
4,2019-02-03


In [33]:
dfs = [ dfp, dfe, dfs] #df_google_dates,

In [34]:
for d in dfs:
    df_final=df_final.merge(dfe,how='left', left_on='date', right_on='date')
df_final=df_final.fillna(0)

In [35]:
df_final.head()

Unnamed: 0,date,libre_comercio_x,inflacion_economica_x,desempleo_x,banco_mundial_x,job_quality_&_labor_market_performance_x,stock_market_x,incertidumbre_economica_x,crecimiento_economico_x,prosperidad_economica_y_finanzas_x,...,job_quality_&_labor_market_performance,stock_market,incertidumbre_economica,crecimiento_economico,prosperidad_economica_y_finanzas,precio_petroleo,pobreza,finanzas_y_bancos,quiebra_economica,macroeconomia_deuda_y_vulnerabilidad
0,2019-01-06,0.0,0.0,-41.995,0.0,0.0,-19.043333,-631.063333,0.0,0.0,...,0.0,-19.043333,-631.063333,0.0,0.0,0.0,-41.553333,0.0,0.0,-37.705
1,2019-01-13,-17.82,0.0,-5.933333,0.0,-9.82,-14.948571,-750.242857,0.0,0.0,...,-9.82,-14.948571,-750.242857,0.0,0.0,0.0,-66.84,0.0,0.0,-7.46
2,2019-01-20,0.0,7.44,-35.166667,0.0,-2.368,-7.531429,-791.151429,0.0,0.0,...,-2.368,-7.531429,-791.151429,0.0,0.0,3.72,-23.345714,0.0,-11.61,-7.76
3,2019-01-27,0.0,0.0,-14.246667,0.0,-10.792,-16.442857,-770.017143,1.99,0.0,...,-10.792,-16.442857,-770.017143,1.99,0.0,0.0,-12.205714,0.0,0.78,-21.0
4,2019-02-03,0.0,0.0,-11.875,-0.5,-3.333333,-12.111429,-696.837143,0.0,0.0,...,-3.333333,-12.111429,-696.837143,0.0,0.0,0.0,-41.56,0.0,-8.773333,-29.86


In [36]:
df_final.columns

Index(['date', 'libre_comercio_x', 'inflacion_economica_x', 'desempleo_x',
       'banco_mundial_x', 'job_quality_&_labor_market_performance_x',
       'stock_market_x', 'incertidumbre_economica_x',
       'crecimiento_economico_x', 'prosperidad_economica_y_finanzas_x',
       'precio_petroleo_x', 'pobreza_x', 'finanzas_y_bancos_x',
       'quiebra_economica_x', 'macroeconomia_deuda_y_vulnerabilidad_x',
       'libre_comercio_y', 'inflacion_economica_y', 'desempleo_y',
       'banco_mundial_y', 'job_quality_&_labor_market_performance_y',
       'stock_market_y', 'incertidumbre_economica_y',
       'crecimiento_economico_y', 'prosperidad_economica_y_finanzas_y',
       'precio_petroleo_y', 'pobreza_y', 'finanzas_y_bancos_y',
       'quiebra_economica_y', 'macroeconomia_deuda_y_vulnerabilidad_y',
       'libre_comercio', 'inflacion_economica', 'desempleo', 'banco_mundial',
       'job_quality_&_labor_market_performance', 'stock_market',
       'incertidumbre_economica', 'crecimiento_econ

In [37]:
df_final=df_final.merge(df_google_dates,how='left', left_on='date', right_on='date')


ValueError: You are trying to merge on datetime64[ns] and object columns. If you wish to proceed you should use pd.concat