# Cleaning Events data

### Importing necessary libraries

In [None]:
pip install pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

### Loading the data

In [2]:
# Raw events table.
events = pd.read_csv('../sources/events.csv')

# Raw registrations table. It is bound to `registered` because the cleaning and
# joining cells further down refer to this frame by that name.
registered = pd.read_csv('../sources/registered_in_events.csv')

## Data Cleaning

**Cleaning events**

In [None]:
# Columns of `events` that are removed before the join.
TO_DROP_EVENTS = ['created_at', 'updated_at', 'organization_id']

# Remove the listed columns, then discard every column that holds only nulls.
events = (
    events
    .drop(columns=TO_DROP_EVENTS)
    .dropna(axis='columns', how='all')
)



**Cleaning registered_in_events**

In [None]:
# Columns of `registered` that are removed before the join.
TO_DROP_REGISTERED = ['id', 'created_at', 'updated_at', 'status']

# Remove the listed columns, then discard every column that holds only nulls.
# A single dropna pass is sufficient: once all-null columns are gone, a second
# identical pass has nothing left to remove.
registered = (
    registered
    .drop(columns=TO_DROP_REGISTERED)
    .dropna(axis=1, how='all')
)

**Joining both datasets**

In [None]:
# Join every registration row to its event (registered.event_id == events.id),
# then discard the event columns that are not kept in the joined frame.
merged = (
    registered
    .merge(events, left_on="event_id", right_on="id")
    .drop(
        columns=[
            'id',
            'excerpt',
            'eventbrite_sync_description',
            'eventbrite_url',
            'eventbrite_id',
            'banner',
        ]
    )
)
merged.shape

**Cleaning merged dataframe**

In [None]:
# Each timestamp column goes through the same three steps:
#   1. parse every value with pd.to_datetime;
#   2. render it as 'YYYY-MM-DD HH:MM:SS' text (this leaves an object column);
#   3. parse that text again so the column holds datetimes once more.
for date_col in ('starting_at', 'ending_at', 'published_at'):
    parsed = merged[date_col].apply(pd.to_datetime)
    as_text = parsed.dt.strftime('%Y-%m-%d %H:%M:%S')
    merged[date_col] = as_text.apply(pd.to_datetime)

In [None]:
# Swap commas for spaces in the free-text columns.
for text_col in ['tags', 'description', 'title']:
    merged[text_col] = merged[text_col].str.replace(',', ' ')

# Replace NaN values throughout the frame with the string 'Undefined'.
merged = merged.replace(np.nan, 'Undefined', regex=True)

In [None]:
# Hand-maintained lists of event ids whose `lang` value is set explicitly.
# Rows whose event_id is in neither list keep their current `lang`.
es_event_ids = [
    35, 36, 38, 40, 414, 37, 130, 123, 41, 122, 141, 42, 146, 125, 145, 46, 48, 47, 49, 131,
    127, 85, 86, 84, 121, 128, 119, 181, 189, 184, 135, 134, 136, 182, 192, 193, 137, 138,
    139, 186, 195, 198, 217, 196, 213, 203, 212, 205, 204, 209, 211, 218, 262, 339, 268,
    260, 263, 142, 183, 140, 432, 261, 363, 264, 344, 340, 308, 316,
]
en_event_ids = [
    39, 43, 187, 45, 44, 190, 144, 50, 51, 126, 180, 191, 132, 129, 120, 185, 188, 197, 200,
    194, 199, 201, 202, 206, 216, 208, 252, 214, 251, 254, 368, 357, 124, 207, 210, 215,
]

# Spanish ids are applied first, then English ones.
for lang_code, event_ids in (('es', es_event_ids), ('en', en_event_ids)):
    in_group = merged['event_id'].isin(event_ids)
    merged['lang'] = np.where(in_group, lang_code, merged['lang'])

In [None]:
# Write the joined events/registrations frame to disk as CSV.
merged.to_csv(path_or_buf='../output/events_and_attendies.csv')

**Joining with form_entries**

In [None]:
# NOTE(review): `forms` is not created in the cells shown here; presumably it is
# the form_entries table loaded elsewhere — confirm before running.

# Only these columns of `forms` are carried into the final frame.
form_columns = ['email', 'lead_type', 'country', 'deal_status', 'won_at']

# Inner join on email: a row is kept only when the email occurs in both frames.
final_df = merged.merge(forms[form_columns], on='email', how='inner')
final_df.shape