# Pure data processing:

- Merge 4 datasets
- Create columns for every keyword
- Take the "unemployment Google searches" keyword, remove the first 2 rows of unemployment, slide the rest of the data to fill that holes  


In [1]:
import pandas as pd

In [2]:
df_political=pd.read_csv("./input/dashboard_spanish_news_political.csv.gz",compression='gzip', header=0, quotechar='"', error_bad_lines=False)
df_political.sort_values(by=["Date"],inplace=True)

df_economical=pd.read_csv("./input/dashboard_spanish_news_economical.csv.gz",compression='gzip', header=0, quotechar='"', error_bad_lines=False)
df_economical.sort_values(by=["Date"],inplace=True)

df_social=pd.read_csv("./input/dashboard_spanish_news_social.csv.gz",compression='gzip', header=0, quotechar='"', error_bad_lines=False)
df_social.sort_values(by=["Date"],inplace=True)

df_google=pd.read_csv("./input/data_pytrends.csv")
df_google.sort_values(by=["date"],inplace=True)

## Dataset structures:

- Requested data from BigQuery (df_political/social/economical): Every different keyword in **political** will be a column with its associated sentiment
- The same for google searches: Every different keyword will be a column with its associated trend_index.
- All df will be resampled and grouped by date to have a weekly display with the corresponding average of **sentiment** or **trend_index**

In [3]:
df_political.head()

Unnamed: 0,political,Date,Sentiment
15773,juicio,2019-01-01,0.18
52050,seguridad_nacional,2019-01-01,-6.55
52051,seguridad_nacional,2019-01-01,-1.43
52052,seguridad_nacional,2019-01-01,-1.43
53246,inestabilidad_politica,2019-01-01,-0.3


In [4]:
# google searches
df_google.columns

Index(['Unnamed: 0', 'keyword', 'date', 'trend_index'], dtype='object')

In [5]:
df_google.drop(columns='Unnamed: 0', inplace=True)
df_google.rename(columns={'date':"Date"}, inplace=True)
df_google.head()

Unnamed: 0,keyword,Date,trend_index
0,zoom,2019-01-06,5
1410,bildu,2019-01-06,3
6204,uber eats,2019-01-06,10
4982,medico,2019-01-06,89
2632,productividad,2019-01-06,39


## Google Dataset: 

- Creating a column for each keyword with the trend_index value

In [6]:
# new dataframe
df_google_dates=pd.DataFrame()

# creating the Date column in new dataset
df_google_dates["Date"]=list(set(df_google["Date"]))
df_google_dates.sort_values(by=["Date"],inplace=True)
df_google_dates.head()

Unnamed: 0,Date
32,2019-01-06
28,2019-01-13
83,2019-01-20
7,2019-01-27
90,2019-02-03


In [7]:
# Creating the new columns. Trend index with the name of the corresponding keyword
keyword_list=list(set(df_google["keyword"]))
keyword_list.sort()
for k in keyword_list:
    df_google_dates[k]=df_google[(df_google['keyword'] == k)]["trend_index"].tolist()

In [8]:
df_google_dates.head()

Unnamed: 0,Date,amazon,autonomo,ayuda alquiler,badi,banco alimentos,barometro,bildu,bullying,cabify,...,taxi,teletrabajo,tinder,uber,uber eats,videoconferencia,videollamada,vox,yoga,zoom
32,2019-01-06,59,47,38,38,5,32,3,24,20,...,42,2,56,27,10,2,2,35,50,5
28,2019-01-13,50,44,32,51,12,28,4,27,30,...,49,1,51,35,15,3,1,27,44,4
83,2019-01-20,44,41,20,38,13,31,2,25,100,...,100,2,50,100,9,3,2,16,41,3
7,2019-01-27,46,51,19,45,6,44,4,30,68,...,85,2,52,78,11,3,3,11,49,4
90,2019-02-03,48,47,21,32,7,58,5,24,45,...,67,1,50,40,11,3,2,13,48,4


In [9]:
# lets check it out if it's right
print(list(df_google_dates["zoom"][:10]),
      "<==>",
      df_google[df_google["keyword"]=="zoom"]["trend_index"].tolist()[:10],
      ", allright then"
     )

[5, 4, 3, 4, 4, 4, 4, 4, 3, 3] <==> [5, 4, 3, 4, 4, 4, 4, 4, 3, 3] , allright then


## Sliding "unemployment" column.

- Now, I have to remove the first 2 rows of the keyword "desempleo", and supress that space with the rest of the column, so the last 2 rows will be empty 

In [10]:
# I should perform feature ingineering before doing this, to check what's really going on

#desempleo_list=list(df_google_dates["desempleo"])

# delete first 0 positions and add empty ones at the end (not the most elegant)
#desempleo_list.pop(0)
#desempleo_list.pop(1)
#desempleo_list.append(0)
#desempleo_list.append(0)

# add to the dataset
#df_google_dates["desempleo"]=desempleo_list

# ok, it works
#df_google_dates[["Date","desempleo"]]

# Manipulating datasets with Spanish news and sentiment.

We'll need to:

- Create a column for each keyword
- Count occurrences of that keyword
- Measure average sentiment
- Group data by week, starting on monday, to merge with the Google dataset
- Combine occurrences and sentiment into one column representative of both, for each keyword

In [16]:
# let's pplay with the 1st dataset and a random keyword, for instance
df_political[df_political["political"]=="juicio"].head()

Unnamed: 0,political,Date,Sentiment
15773,juicio,2019-01-01,0.18
15774,juicio,2019-01-01,0.18
15784,juicio,2019-01-01,-6.08
15777,juicio,2019-01-01,-4.06
15778,juicio,2019-01-01,-4.06


- So, I need to measure the average of sentiment of each keyword per day

In [18]:
df_political[df_political["political"]=="juicio"].groupby("Date").mean()

Unnamed: 0_level_0,Sentiment
Date,Unnamed: 1_level_1
2019-01-01,-2.930000
2019-01-02,-3.986667
2019-01-03,-2.350000
2019-01-04,-4.822500
2019-01-05,-4.723333
...,...
2020-10-14,-7.511739
2020-10-15,-6.625128
2020-10-16,-8.926000
2020-10-17,-5.765000


- Also, counting occurrences of that keyword

In [12]:
df_political[df_political["political"]=="juicio"].groupby("Date").count()

Unnamed: 0_level_0,political,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,14,14
2019-01-02,18,18
2019-01-03,40,40
2019-01-04,16,16
2019-01-05,60,60
...,...,...
2020-10-14,23,23
2020-10-15,39,39
2020-10-16,5,5
2020-10-17,10,10


- Let's use an aggregate to perform both

In [77]:
df2=df_political[df_political["political"]=="juicio"].groupby(["Date"]).agg(['count','mean'])
# erase multiindex
df2.columns=df2.columns.droplevel(0)
df2.head()

Unnamed: 0_level_0,count,mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,14,-2.93
2019-01-02,18,-3.986667
2019-01-03,40,-2.35
2019-01-04,16,-4.8225
2019-01-05,60,-4.723333


- Great, now let's resample by week, starting on Sunday, like the Google Searches dataset

In [80]:
df2.index = pd.to_datetime(df2.index)

In [81]:
df2 = df2.resample('W-SUN').mean() #weekly totals

In [83]:
df2.head()

Unnamed: 0_level_0,count,mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-06,27.666667,-4.097824
2019-01-13,68.857143,-6.805291
2019-01-20,36.857143,-6.178493
2019-01-27,37.714286,-5.767773
2019-02-03,36.0,-4.441655


- ok, now I know how to do it, then let's continue creating a function to perform this for every keyword in every Spanish news dataset

In [87]:
- I need to create an empty dataframe, 
- loop por ,each keyword from a set of keywords
- perform what i did before for all keywords
- concat to the mepty dataframe
- put all this in a function

SyntaxError: invalid syntax (<ipython-input-87-212acc1a1690>, line 1)

'hola'