# Import libraries we need for this project

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from time import time

### Specify the file containing our data

In [None]:
# Name of the CSV data file; also quoted in the load-confirmation message.
filename = 'Weather_Data.csv'

### Read the file into a pandas DataFrame

In [None]:
# Load the CSV named by `filename` rather than repeating the literal path, so
# the "Loaded ... from ..." report always names the file that was actually read.
df = pd.read_csv(filename)

In [None]:
# Report how many rows were read and from which file.
print(f'Loaded {len(df)} records from {filename}.\n')

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

### Check that we have loaded the correct data

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

### For regression we need to convert the categorical columns to binary indicator columns so the model can use them. In our case these are RainToday, WindGustDir, WindDir9am and WindDir3pm, which we one-hot encode.

In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value it contains.
categorical_columns = ['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
df_processed = pd.get_dummies(df, columns=categorical_columns)

## Reconfirm that the data is in the expected shape

In [None]:
# Preview the first five rows of the encoded frame.
df_processed.head(n=5)

### We also convert the RainTomorrow column from "Yes"/"No" to binary (1/0) for regression

In [None]:
# Map the string labels 'No'/'Yes' to 0/1 in place. The call is applied to the
# whole frame, so any column still holding these labels is converted.
df_processed.replace(to_replace=['No', 'Yes'], value=[0, 1], inplace=True)

# Prepare our data for Training and Testing

In [None]:
# Remove the 'Date' column in place.
df_processed.drop(columns='Date', inplace=True)

In [None]:
# Cast every remaining column to floating point.
df_processed = df_processed.astype('float64')

In [None]:
# DataFrame.info() prints the column/dtype summary itself and returns None;
# the surrounding print() only added a stray "None" line, so call it directly.
df_processed.info()

In [None]:
# Show summary statistics with two decimal places; the float format is only
# overridden while the context manager is active.
two_decimals = '{:.2f}'.format
with pd.option_context('float_format', two_decimals):
    summary = df_processed.describe()
    print(summary)

## Use histograms to visualize the distribution of various features

In [None]:
# Set default font sizes for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Draw histograms of the columns of the raw (un-encoded) frame.
df.hist(figsize=(20, 15))
# Render the histograms. The previous plt.figure() call here did not display
# anything; it only opened an additional, empty figure.
plt.show()

## Show correlations with RainTomorrow

In [None]:
# Header for the listing below; the label now spells the column name the same
# way as the 'RainTomorrow' key that is actually indexed.
print('Pearson correlations with RainTomorrow')
# DataFrame.corr() computes pairwise Pearson correlations by default.
corr_matrix = df_processed.corr()
# Correlation of every column with the target, largest value first.
corr_matrix['RainTomorrow'].sort_values(ascending=False)

## Analyze cross correlations

In [None]:
# Create a 20x20 inch canvas and title it.
f, axes = plt.subplots(figsize=(20, 20))
axes.set_title('All Correlations', fontsize=32)

# Columns shown in the heatmap, in display order. RainTomorrow comes first so
# its row/column lines up against every other feature.
features = [
    'RainTomorrow',
    'Humidity3pm',
    'Cloud3pm',
    'RainToday_Yes',
    'Cloud9am',
    'MaxTemp',
    'Rainfall',
    'Evaporation',
    'Temp3pm',
]

# Plot the correlation matrix of the selected columns as an annotated Seaborn
# heatmap with white separators between the square cells.
sns.heatmap(
    df_processed[features].corr(),
    annot=True,
    annot_kws={'size': 11},
    cmap='Greens',
    linewidths=3.0,
    linecolor='w',
    square=True,
    cbar_kws={'shrink': .5},
);


In [None]:
# Feature matrix: every column except the target. `columns=` already selects
# the axis, so no separate axis argument is needed.
x_values = df_processed.drop(columns=['RainTomorrow'])

In [None]:
# Target vector: the RainTomorrow column.
y_values = df_processed.loc[:, 'RainTomorrow']

# SVM

### We split the data into training and testing sets: 80% for training and 20% for testing

In [None]:
# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.2,
    shuffle=True,
    random_state=10,
)

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with C=100 and a fixed seed.
# NOTE(review): binding the name `svm` here hides the `svm` module imported
# from sklearn at the top of the file.
svm = SVC(
    C=100,
    kernel='rbf',
    random_state=1936,
)
# Fit on the training split.
svm.fit(x_train, y_train)

# NOTE(review): plot_decision_boundary is not defined in the visible part of
# this file - confirm it is provided before this cell runs.
plot_decision_boundary(x_train, y_train, svm, True)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: every listed kernel is combined with every C.
grid = [{
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.01, 0.1, 1, 5, 10, 25, 50, 100],
}]

# Base estimator whose kernel and C are tuned by the search.
svm = SVC(gamma='auto', random_state=1936)

# 5-fold cross-validated search, scored by accuracy, on the training split.
search = GridSearchCV(estimator=svm, param_grid=grid, scoring='accuracy', cv=5)
search.fit(x_train, np.ravel(y_train));

# Best parameter combination found by the search.
print(search.best_params_)