In [None]:
import numpy as np
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,VotingClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,make_scorer
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
np.random.seed(25)

In [None]:
# Load the three competition CSVs in one pass: flight records for the
# training days, plus the weather readings for the training and test days.
flight_train, weather_train, weather_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/flight_data_train.csv',
        '../input/weather_data_train.csv',
        '../input/weather_data_test.csv',
    )
)

In [None]:
# Preview the first rows of the flight data to check that it loaded as expected.
flight_train.head()

In [None]:
# Preview the first rows of the training weather data.
weather_train.head()

## Feature Engineering

### For Flight data

Let's create the target column from the total number of flights taken on a day: a day with 15 or more flights across all spots is a good day (1), otherwise a bad day (0).

In [None]:
# Pick out every column whose name ends with 'totalFlights'.
is_flight_count = flight_train.columns.str.endswith('totalFlights')
columns = flight_train.columns[is_flight_count]

# Row-wise sum of those columns: the number of flights over all spots.
flight_train['TOTAL_FLIGHTS_FOR_ALL_SPOTS'] = flight_train.loc[:, columns].sum(axis=1)

# Target label: 1 (good day) when the total reaches 15 flights, else 0 (bad day).
flight_train['Good_Bad'] = flight_train['TOTAL_FLIGHTS_FOR_ALL_SPOTS'].ge(15).astype(int)

Let's create some more features.

In [None]:
# Sum every column whose name ends with 'totalDistance' into one per-row total.
is_total_distance = flight_train.columns.str.endswith('totalDistance')
columns = flight_train.columns[is_total_distance]
flight_train['TOTAL_DISTANCE_FOR_ALL_SPOTS'] = flight_train.loc[:, columns].sum(axis=1)

In [None]:
# Sum every column whose name ends with 'maxDistance' into one per-row total.
is_max_distance = flight_train.columns.str.endswith('maxDistance')
columns = flight_train.columns[is_max_distance]
flight_train['TOTAL_MAX_DISTANCE_FOR_ALL_SPOTS'] = flight_train.loc[:, columns].sum(axis=1)

In [None]:
# Turn each per-row total into an "average" by dividing by a fixed 288.
# NOTE(review): 288 is hard-coded — presumably the number of spots the totals
# were summed over; confirm against the actual column count.
for total_col, average_col in (
    ('TOTAL_DISTANCE_FOR_ALL_SPOTS', 'AVERAGE_DISTANCE'),
    ('TOTAL_FLIGHTS_FOR_ALL_SPOTS', 'AVERAGE_FLIGHTS'),
    ('TOTAL_MAX_DISTANCE_FOR_ALL_SPOTS', 'AVERAGE_MAX_DISTANCE'),
):
    flight_train[average_col] = flight_train[total_col] / 288

### For weather data

In [None]:
### Train

# For each weather measurement, collapse every column whose name contains the
# measurement label into a single per-row feature: the row-wise sum divided
# by a fixed 45.
# NOTE(review): 45 is hard-coded — presumably the number of readings per
# measurement; confirm it matches the number of matched columns.
for label, feature in [
    ('Pressure', 'AVERAGE_PRESSURE'),
    ('Temperature', 'AVERAGE_TEMPERATURE'),
    ('Wind Speed', 'AVERAGE_WIND_SPEED'),
    ('Wind Direction', 'AVERAGE_WIND_DIRECTION'),
    ('Dew Point', 'AVERAGE_DEW_POINT'),
]:
    columns = weather_train.columns[weather_train.columns.str.contains(label)]
    weather_train[feature] = weather_train[columns].sum(axis=1) / 45

In [None]:
### Test

# Build the same aggregate features on the test weather data as on the
# training data: for each measurement label, the row-wise sum of the matching
# columns divided by a fixed 45.
# NOTE(review): 45 is hard-coded — presumably the number of readings per
# measurement; confirm it matches the number of matched columns.
for label, feature in [
    ('Pressure', 'AVERAGE_PRESSURE'),
    ('Temperature', 'AVERAGE_TEMPERATURE'),
    ('Wind Speed', 'AVERAGE_WIND_SPEED'),
    ('Wind Direction', 'AVERAGE_WIND_DIRECTION'),
    ('Dew Point', 'AVERAGE_DEW_POINT'),
]:
    columns = weather_test.columns[weather_test.columns.str.contains(label)]
    weather_test[feature] = weather_test[columns].sum(axis=1) / 45

## Merge flight data and weather data

In [None]:
# Attach the Good_Bad label from the flight data to each weather row, matching
# on Day_Id. The default inner join keeps only days present in both frames.
#
# Any Good_Bad column left over from a previous run of this cell is dropped
# first so the cell is idempotent: without that, re-running it would produce
# Good_Bad_x / Good_Bad_y columns, which breaks the later
# weather_train['Good_Bad'] lookup and leaks label copies into the features.
weather_train = pd.merge(
    weather_train.drop(columns=['Good_Bad'], errors='ignore'),
    flight_train[['Day_Id', 'Good_Bad']],
    on='Day_Id',
)

In [None]:
# Preview the weather data after the merge with the flight labels.
weather_train.head()

In [None]:
# Model inputs are every weather_train column except the day identifier and
# the label; the label column itself becomes the training target.
excluded = {'Day_Id', 'Good_Bad'}
feature_names = [col for col in weather_train.columns if col not in excluded]
target = weather_train['Good_Bad']

In [None]:
# Disabled experiment (nothing in this cell runs): PCA explained-variance
# exploration on the scaled weather features, kept for reference.
# NOTE(review): the train (X) and test (Y) matrices are each passed through
# scale() separately, so they would not share a common scale — revisit before
# re-enabling.
#
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import scale

# X=weather_train[feature_names].values
# X = scale(X)

# Y=weather_test[feature_names].values
# Y = scale(Y)

# pca = PCA(n_components=200)
# pca.fit(X)

# #The amount of variance that each PC explains
# var= pca.explained_variance_ratio_

# print(var)

# #Cumulative Variance explains
# var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

# plt.plot(var1)

In [None]:
# Disabled experiment (nothing in this cell runs): CatBoost trained on
# PCA-reduced features. It relies on X, Y, PCA and scale from the other
# commented-out PCA cell, so both must be re-enabled together.
# NOTE(review): pca.fit(X) is immediately followed by pca.fit_transform(X),
# so the first fit is redundant.
#
# pca = PCA(n_components=200)
# pca.fit(X)
# X_train=pca.fit_transform(X)
# X_test = pca.transform(Y)

# model= CatBoostClassifier(learning_rate=0.1,verbose=False, iterations=500,depth=8)

# ## model training and prediction
# model.fit(X_train, target)
# pred = model.predict(X_test)

In [None]:
# Fit a default-configured logistic regression on the training weather
# features, then predict the Good_Bad label for the test weather rows.
# NOTE(review): the features are passed in unscaled and the estimator uses its
# defaults; consider standardising the inputs if the solver reports
# convergence warnings.
train_features = weather_train[feature_names]
test_features = weather_test[feature_names]

model = LogisticRegression().fit(train_features, target)
pred = model.predict(test_features)

In [None]:
## make submission
# Two-column submission: the test-set day identifier and the predicted label
# cast to int, written to result.csv without the index column.
sub = pd.DataFrame({
    'Day_Id': weather_test['Day_Id'],
    'Good_Bad': pred.astype(int),
})
sub.to_csv('result.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()