In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Build a small DataFrame from a dictionary: every key becomes a column
# label and the list stored under it supplies that column's values.
temp_dict = dict(
    Fruit=["Apple", "Banana"],
    Drinks=["Fanta", "Coke"],
)
print(temp_dict)
pd.DataFrame(data=temp_dict)

In [None]:
# Create a DataFrame from two lists of unequal length.
# Passing [a, b] makes each list one ROW, and the shorter list is padded
# with missing values. Transposing (.T) turns the two lists into columns,
# which are then given readable names.
# (The frame is built once; a bare pd.DataFrame([a, b]) in the middle of a
# cell is neither displayed nor stored, so it was dropped.)
a = [1, 2, 3, 4, 5]
b = ['a', 'e', 'i']
temp = pd.DataFrame([a, b]).T
temp.columns = ["first", "second"]
temp

In [None]:
# Load the Titanic CSV (expected next to the notebook) into a DataFrame
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Inspect the dtype pandas inferred for each column and check that it makes
# sense. Columns with an unsuitable dtype are converted in the next step.
df.dtypes

In [None]:
# These three columns are identifiers/labels rather than quantities, so
# store them as pandas categoricals, then re-check the dtypes.
for column in ('PassengerId', 'Survived', 'Pclass'):
    df[column] = df[column].astype('category')
df.dtypes

In [None]:
## Count the missing (null) values in every column.
df.isna().sum()

In [None]:
# Age has 177 nulls! Out of how many?
# Options: drop those rows, or fill the gaps with a summary statistic.
# Here we fill with the average age (truncated to a whole number by int()).
avg_age = int(df['Age'].mean())
print(avg_age)

In [None]:
# Keep the original 'Age' untouched: write the mean-imputed values into a
# new column 'age_av', then confirm that it has no nulls left.
df['age_av'] = df['Age'].fillna(avg_age)
df['age_av'].isna().sum()

In [None]:
# Compare the age distribution after mean imputation with the original.
# Green bars: 'age_av' (nulls replaced by avg_age); red overlay: 'Age'.
# Labels, title and a legend are set so the figure can be read on its own.
bins = range(0,90,10)  # bin edges 0, 10, ..., 80; reused by later cells
plt.hist(df['age_av'], bins=bins, color='g', rwidth=0.8,
         label='age_av (nulls filled with average)')
plt.hist(df['Age'], bins=bins, color='red', alpha=0.5,
         label='Age (original)')
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age distribution: mean-filled vs. original')
plt.legend()
plt.show()

In [None]:
# Next option: forward fill, i.e. carry the last valid Age down into the
# nulls that follow it. The result goes into a new column 'age_fill'.
# Series.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases.
df['age_fill'] = df['Age'].ffill()

In [None]:
# Compare the forward-filled ages (green) with the original 'Age' (red).
# `bins` is the set of bin edges defined in the earlier histogram cell.
plt.hist(df['age_fill'], bins=bins, color='g', rwidth=0.8,
         label='age_fill (forward-filled)')
plt.hist(df['Age'], bins=bins, color='red', alpha=0.5,
         label='Age (original)')
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age distribution: forward-filled vs. original')
plt.legend()
plt.show()

In [None]:
# Next option: linear interpolation between the neighbouring valid ages.
# The result goes into a new column 'age_lin'.
df['age_lin'] = df['Age'].interpolate(method='linear')

In [None]:
# Compare the linearly interpolated ages (green) with the original 'Age' (red).
# `bins` is the set of bin edges defined in the earlier histogram cell.
plt.hist(df['age_lin'], bins=bins, color='g', rwidth=0.8,
         label='age_lin (linear interpolation)')
plt.hist(df['Age'], bins=bins, color='red', alpha=0.5,
         label='Age (original)')
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age distribution: interpolated vs. original')
plt.legend()
plt.show()

In [None]:
# Rows where the original Age is missing, shown side by side with the
# three imputed variants.
df.loc[df['Age'].isna(), ['Age', "age_lin", "age_av", "age_fill"]]

In [None]:
# First ten rows of the original Age next to each imputed column.
df.loc[:, ['Age', "age_lin", "age_av", "age_fill"]].head(n=10)

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Preview the first five rows, now including the imputed age columns.
df.head(n=5)

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also holds string columns (e.g. 'Sex') and categoricals;
# pandas >= 2.0 no longer drops those automatically inside corr() and
# raises instead, so restrict to numeric dtypes explicitly.
df.select_dtypes(include='number').corr()

In [None]:
import seaborn as sns

# Annotated heatmap of the correlations between four numeric columns.
sns.heatmap(
    data=df.loc[:, ['Age', "Fare", "SibSp", "Parch"]].corr(),
    annot=True,
)

In [None]:
# Groupby warm-up: how many rows fall under each 'Survived' value?
df.Survived.value_counts()

In [None]:
# How many males survived? Filter to male rows and count each
# 'Survived' value.
df.loc[df['Sex'] == 'male', 'Survived'].value_counts()

In [None]:
# Count rows for every (Sex, Survived) combination.
# 'Survived' was converted to a categorical earlier, so `observed` is set
# explicitly: relying on the implicit default emits a FutureWarning in
# recent pandas and the default is scheduled to change. observed=False
# keeps the current output (all category combinations are listed).
df.groupby(['Sex','Survived'], observed=False)['Survived'].count() # sum(), mean()

In [None]:
#HW1:
# How many different Pclasses are there?
# Within each Pclass, are there more male or more female passengers?
# Is there any relationship between Pclass and whether a passenger survived?
# Is there any relationship between Age and whether a passenger survived?
# Who paid more on average: males or females?
# 'Cabin' has too many null values. Remove that column from the dataframe.

# Handling DateTime Using Pandas

In [None]:
# Download the OPSD Germany CSV straight from GitHub and inspect the
# dtypes pandas inferred for its columns.
url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'
data = pd.read_csv(filepath_or_buffer=url, sep=",")
data.dtypes

In [None]:
# Preview the first five rows of the time-series data.
data.head(n=5)

In [None]:
# Explicitly convert the 'Date' column to a datetime dtype, then re-check
# the dtypes to confirm the conversion.
data['Date'] = pd.to_datetime(data.Date)
data.dtypes

In [None]:
# Use the datetime column as the row index and display the new index.
data = data.set_index(keys='Date')
data.index

In [None]:
# Look at `freq` in the index shown above: it is currently None, meaning
# pandas does not know whether the data was collected by the hour, by day,
# by minute, etc.
# asfreq('D') returns a copy of the data conformed to a daily frequency.
data_freq = data.asfreq(freq='D')
data_freq

In [None]:
# Show the index of the daily-frequency frame (check its `freq` attribute).
data_freq.index

In [None]:
## If any dates were missing, asfreq inserted NaN rows for them, which
## would then need imputing. Compare the shapes before and after to see
## whether rows were added.
(data.shape, data_freq.shape)

In [None]:
# Missing values per column in the daily-frequency frame.
data_freq.isna().sum()

In [None]:
# Rebuild the daily-frequency frame, forward-filling any rows that asfreq
# has to insert, then recount the missing values per column.
data_freq = data.asfreq(freq='D', method='ffill')
data_freq.isna().sum()

In [None]:
## Resample to weekly means
data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']
# rule='W' groups the rows into weekly bins; mean() averages each bin.
data_weekly_mean = data.loc[:, data_columns].resample(rule='W').mean()
data_weekly_mean

In [None]:
# Rolling average over a 7-row window. With center=True each value is
# labelled at the middle of its window instead of at the window's end.
data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']
data_7d_rol = data.loc[:, data_columns].rolling(7, center=True).mean()
data_7d_rol

In [None]:
# Same rolling mean over a centred 365-row window, which smooths the
# series further for the trend line plotted below.
data_365d_rol = data.loc[:, data_columns].rolling(365, center=True).mean()

In [None]:
# Overlay the raw 'Consumption' values (dots) with the 7-row and 365-row
# centred rolling means. Labels, title and a legend are set so the three
# series can be told apart.
plt.figure(figsize=[15,5])
plt.plot(data['Consumption'], marker='.', linestyle="None", label='Raw values')
plt.plot(data_7d_rol['Consumption'], linewidth=2, label='7-day rolling mean')
plt.plot(data_365d_rol['Consumption'], linewidth=3, label='365-day rolling mean')
plt.xlabel('Date')
plt.ylabel('Consumption')
plt.title('Consumption: raw values and rolling means')
plt.legend()
plt.show()

In [None]:
## Splitting the date: derive 'Year' and 'Month' columns from the
## datetime index.
import datetime  # NOTE(review): not referenced in this cell; kept in case other cells rely on it
data['Year'], data['Month'] = data.index.year, data.index.month

In [None]:
# Preview the first five rows, now including the 'Year' and 'Month' columns.
data.head(n=5)