### **Importing Libraries**


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
import pickle
import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
# Mount Google Drive into the Colab runtime; the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

KeyboardInterrupt: ignored

### **Read & Understand Data**

In [None]:
# Load the raw training CSV from the mounted Drive folder and preview its first rows.
train = pd.read_csv("/content/drive/My Drive/HackersEarth/train.csv")
train.head()

In [None]:
# Load the raw test CSV from the same Drive folder and preview its first rows.
test = pd.read_csv("/content/drive/My Drive/HackersEarth/test.csv")
test.head()

In [None]:
# Report the number of rows and columns of each raw frame.
print("Train data contains % 2d rows and % 2d columns" %(train.shape[0],train.shape[1]), "\n")
print("Test data contains {} rows and {} columns" .format(test.shape[0],test.shape[1]))

In [None]:
# Print every train column that does not exist in test
# (presumably just the target column -- TODO confirm against the output).
for columns in train.columns:
  if columns not in test.columns:
    print("Column not present in the Test Data is: ", columns)

### **EDA for Train Data**

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Summary statistics of the raw training frame.
train.describe()

In [None]:
# One 50-bin histogram per column of the training frame, drawn on a 15x15 figure.
train.hist(bins=50, figsize=(15, 15))

In [None]:
# Skewness of each numeric training column. numeric_only=True is required because
# the frame also holds string columns (ids, timestamps, categories), on which
# skew() raises a TypeError in pandas >= 2.0.
train.skew(numeric_only=True)

In [None]:
# Number of missing values in each column of the raw training frame.
print(train.isnull().sum())

### **EDA for Test Data**

In [None]:
# Column dtypes and non-null counts of the raw test frame.
test.info()

In [None]:
# Summary statistics of the raw test frame.
test.describe()

In [None]:
# One 50-bin histogram per column of the test frame, drawn on a 15x15 figure.
test.hist(bins=50, figsize=(15, 15))

In [None]:
# Skewness of each numeric test column. numeric_only=True is required because
# the frame also holds string columns (ids, timestamps, categories), on which
# skew() raises a TypeError in pandas >= 2.0.
test.skew(numeric_only=True)

In [None]:
# Number of missing values in each column of the raw test frame.
print(test.isnull().sum())

### **Copying the DataFrame for manipulation**

In [None]:
# Work on copies so the raw `train` / `test` frames stay untouched
# (`test` is read again when the submission frame is built).
data_train = train.copy()
data_test = test.copy()

## **Cleaning Data**

In [None]:
# data_train['wind_speed(m/s)'] = np.absolute(data_train['wind_speed(m/s)'])
# data_train['shaft_temperature(°C)']

### **Identifying the Numerical and Categorical Columns**

In [None]:
def _columns_by_dtype(frame):
  """Return (object-dtype column names, all other column names), both in frame order."""
  objectColumns, otherColumns = [], []
  for name in frame.columns:
    if frame[name].dtype == object:
      objectColumns.append(name)
    else:
      otherColumns.append(name)
  return objectColumns, otherColumns


# Object-dtype columns are treated as categorical, everything else as numerical.
categoricalCol_train, numericalCol_train = _columns_by_dtype(data_train)
print("Categorical Columns in Train data are: ", categoricalCol_train)

print("Numerical Columns in Train data are: ", numericalCol_train)

print("------------------------------------------------------------------------------------------")

categoricalCol_test, numericalCol_test = _columns_by_dtype(data_test)
print("Categorical Columns in Test data are: ", categoricalCol_test)

print("Numerical Columns in Test data are: ", numericalCol_test)

### **Handling Null Values in Numerical data**

In [None]:
# Median-impute every numeric training column except the target; the target is
# skipped here and only reported.
for column in numericalCol_train:
  if column == 'windmill_generated_power(kW/h)':
    print('Skipping null value imputation for target column')
    continue
  columnMedian = data_train[column].median()
  data_train[column] = data_train[column].fillna(columnMedian)

# Remaining null counts per column after imputation.
data_train.isnull().sum()

In [None]:
# Fill missing numeric test values with the medians of the raw TRAINING data.
# The training frame was imputed with exactly these medians, so both frames now
# share the same fill values instead of the test set being imputed from its own
# distribution.
for column in numericalCol_test:
  if column == 'windmill_generated_power(kW/h)':
    print('Skipping null value imputation for target column')
    continue
  # Fall back to the test median only for a column that train does not have.
  medianSource = train if column in train.columns else data_test
  data_test[column] = data_test[column].fillna(medianSource[column].median())

# Remaining null counts per column after imputation.
data_test.isnull().sum()

### **Handling Null Values in Categorical Data**

##### **Finding unique value for the categorical columns**

In [None]:
# Show the distinct values of each categorical column; the id and timestamp
# columns are skipped.
for col in categoricalCol_train:
  if col == 'tracking_id' or col == 'datetime':
    continue
  print("Unique values for: ", col)
  print(data_train[col].unique())
  print("\n")

In [None]:
# Count the missing values in the two categorical feature columns.
print('Total null values in Cloud Level Feature: {}'.format(data_train['cloud_level'].isnull().sum()))
print('Total null values in Turnbine Status Feature: {}'.format(data_train['turbine_status'].isnull().sum()))

##### **Replace Null values with MODE**

In [None]:
# Fill missing categorical training values with the most frequent value of the
# column; the id and timestamp columns are left as they are.
for column in categoricalCol_train:
  if column in ('tracking_id', 'datetime'):
    continue
  modeValue = data_train[column].mode()[0]
  print('Mode for {} is: {}'.format(column, modeValue), "\n")
  data_train[column] = data_train[column].fillna(modeValue)

# Remaining null counts per column after imputation.
data_train.isnull().sum()

In [None]:
# Fill missing categorical test values with the most frequent value observed in
# the raw TRAINING data, so train and test are imputed with the same category.
# The id and timestamp columns are left as they are.
for column in categoricalCol_test:
  if column in ('tracking_id', 'datetime'):
    continue
  # Fall back to the test mode only for a column that train does not have.
  modeSource = train if column in train.columns else data_test
  modeValue = modeSource[column].mode()[0]
  print('Mode for {} is: {}'.format(column, modeValue), "\n")
  data_test[column] = data_test[column].fillna(modeValue)

# Remaining null counts per column after imputation.
data_test.isnull().sum()

### **Deleting the Null value rows for target feature in Train Data**

In [None]:
# Drop every row that still contains a null in any column. The feature columns
# were imputed above, so this is expected to remove rows with a missing target
# (TODO confirm no other column, e.g. datetime, still has nulls).
data_train = data_train.dropna(how='any',axis=0)
data_train.isnull().any()

In [None]:
# Renumber the rows 0..n-1 after the row removal and discard the old index.
data_train.reset_index(drop= True, inplace=True)
print(data_train.shape, "\n")
data_train.head()

### **Removing duplicated rows, as they won't give any additional insight**

In [None]:
# True if at least one training row is an exact repeat of an earlier row.
data_train.duplicated().any()

In [None]:
print("Shape before deleting duplicates: ", data_train.shape, "\n")
# keep='first' retains one copy of every duplicated row. keep=False would delete
# ALL copies, silently throwing away valid observations instead of de-duplicating.
data_train = data_train.drop_duplicates(keep='first')
print("Shape after deleting duplicates: ", data_train.shape)

In [None]:
# True if at least one test row is an exact repeat of an earlier row (check only; nothing is removed).
data_test.duplicated().any()

### **Converting Categorical Data to Numerical Data**

In [None]:
# One-hot encode the two categorical features; each category becomes its own
# `<column>_<value>` indicator column.
data_train = pd.get_dummies(data_train, columns=['turbine_status','cloud_level'])
data_train.head(1)

In [None]:
# One-hot encode the same two categorical features in the test frame.
# NOTE(review): train and test are encoded independently, so a category that
# appears in only one of them would yield different indicator columns --
# confirm that both frames end up with the same column set.
data_test = pd.get_dummies(data_test, columns=['turbine_status','cloud_level'])
data_test.head(1)

### **Converting the date column to datetime format and splitting it into month, day and year**



In [None]:
# Parse the raw `datetime` strings into a temporary datetime64 column.
data_train['datetimeNew'] = pd.to_datetime(data_train['datetime'])
data_train.head(1)

In [None]:
# Parse the raw `datetime` strings of the test frame into a temporary datetime64 column.
data_test['datetimeNew'] = pd.to_datetime(data_test['datetime'])

In [None]:
# Derive month / day / year features from the parsed timestamp, then drop the
# temporary datetime column (hour and minute are intentionally not extracted).
trainTimestamps = data_train['datetimeNew']
data_train = data_train.assign(
    month=trainTimestamps.dt.month,
    day=trainTimestamps.dt.day,
    year=trainTimestamps.dt.year,
).drop(columns=['datetimeNew'])
data_train.head(1)

In [None]:
# Derive month / day / year features from the parsed timestamp, then drop the
# temporary datetime column (hour and minute are intentionally not extracted).
testTimestamps = data_test['datetimeNew']
data_test = data_test.assign(
    month=testTimestamps.dt.month,
    day=testTimestamps.dt.day,
    year=testTimestamps.dt.year,
).drop(columns=['datetimeNew'])
data_test.head(1)

### **Handling Skewness of the Data**




```
wind_speed(m/s)                   1.680262
atmospheric_temperature(°C)      -1.674895
shaft_temperature(°C)            -2.525168
engine_temperature(°C)           -3.944776
windmill_body_temperature(°C)    -2.236832
rotor_torque(N-m)                -1.030947
blade_length(m)                  -8.608358
```



In [None]:
# Columns whose distribution is strongly skewed (see the skew table above).
skewedColumns = ['atmospheric_temperature(°C)', 'shaft_temperature(°C)', 'engine_temperature(°C)', 'windmill_body_temperature(°C)', 'rotor_torque(N-m)', 'blade_length(m)']

# A single PowerTransformer learns an independent Yeo-Johnson lambda (plus the
# mean/std used for standardisation) for every column it is fitted on.
power = PowerTransformer(method='yeo-johnson', standardize=True)

# Fit on the TRAINING data only ...
data_train[skewedColumns] = power.fit_transform(data_train[skewedColumns])
# ... and apply those training parameters to the test data. Re-fitting on test
# would map train and test through different transforms, so the model would be
# asked to predict on features that live on another scale than it was trained on.
data_test[skewedColumns] = power.transform(data_test[skewedColumns])

# Both previews live in one cell now, so the first needs an explicit display().
display(data_train.head())
data_test.head()

### **Handling Outliers**

In [None]:
# Draw one box plot per numeric column, each on its own figure, to eyeball outliers.
for column in numericalCol_train:
 plt.figure()
 data_train.boxplot([column])

In [None]:
def outlier_treatment(datacolumn):
  """Compute the 1.5*IQR outlier fences of a numeric column and report outliers.

  Despite its name the function only reports; the data is not modified.

  Parameters
  ----------
  datacolumn : pandas.Series or numpy.ndarray
      Numeric values. NaNs are ignored when the quartiles are computed.

  Returns
  -------
  tuple
      (lower_range, upper_range) where lower_range = Q1 - 1.5*IQR and
      upper_range = Q3 + 1.5*IQR.
  """
  # nanpercentile does not need sorted input, so no sorting is done here.
  Q1, Q3 = np.nanpercentile(datacolumn, [25, 75])
  IQR = Q3 - Q1
  lower_range = Q1 - (1.5 * IQR)
  upper_range = Q3 + (1.5 * IQR)
  print("Lower bound: ", lower_range, "Upper bound: ", upper_range)

  # Count the values outside the fences once and branch on the count.
  outliers = (datacolumn < lower_range).sum() + (datacolumn > upper_range).sum()
  if outliers > 0:
    print(outliers, " No of Outliers present: ", "\n")
  else:
    print("No Outliers Detected", "\n")

  return lower_range, upper_range

In [None]:
# Run the IQR outlier report on every numeric training column and collect the
# lower/upper fences in two lists that follow the order of numericalCol_train.
lowerbound = []
upperbound = []
for column in numericalCol_train:
  print("Outlier check for column: ",column)
  lowerbound_column, upperbound_column = outlier_treatment(data_train[column])
  lowerbound.append(lowerbound_column)
  upperbound.append(upperbound_column)

In [None]:
# from scipy import stats
# z = np.abs(stats.zscore(data_train))
# print(z)

### **Separating features and labels**

In [None]:
# Training feature matrix: everything except the id, the raw timestamp string,
# motor torque and the target column.
newDataFrame = data_train.drop(['tracking_id','datetime','motor_torque(N-m)','windmill_generated_power(kW/h)'], axis=1)

In [None]:
# `features` holds all rows/columns of the feature frame; `label` is the target column.
features = newDataFrame.iloc[:]
label = data_train['windmill_generated_power(kW/h)']

In [None]:
# Test feature matrix: drop the same non-feature columns as for training.
newDataFrameTest = data_test.drop(['tracking_id','datetime','motor_torque(N-m)'], axis=1)
# Train and test were one-hot encoded independently, so force the test frame
# onto exactly the training feature columns (same set, same order). Indicator
# columns that never occurred in test are added as all-zero; columns unknown to
# the training data are dropped. Without this the positional `.values` array
# may not line up with what the model was trained on.
newDataFrameTest = newDataFrameTest.reindex(columns=newDataFrame.columns, fill_value=0)
featuresTest = newDataFrameTest.iloc[:].values

### **Feature Engineering**

In [None]:
 plt.figure(figsize=(30,20))
sns.heatmap(data_train.corr(),annot=True,cmap='BuGn_r',fmt='.3f')

In [None]:
def calc_vif(X):
  """Return the variance inflation factor of every column of X.

  Parameters
  ----------
  X : pandas.DataFrame
      Feature frame whose columns are all numeric (or boolean indicators).

  Returns
  -------
  pandas.DataFrame
      A new frame with the columns "variables" (column name) and "VIF".
      Nothing outside the function is modified, so it can be re-run with a
      frame of any width.
  """
  # Build the float design matrix once: boolean dummies would otherwise give an
  # object array, and X.values would be rebuilt for every single feature.
  values = X.astype(float).values
  return pd.DataFrame({
    "variables": list(X.columns),
    "VIF": [variance_inflation_factor(values, i) for i in range(values.shape[1])],
  })

X = newDataFrame
vif = calc_vif(X)
# Highest VIF (strongest multicollinearity) first.
vif_high = vif.sort_values(by = 'VIF', ascending=False)
vif_high

In [None]:
# Recursive feature elimination driven by a plain linear regression, removing
# one feature per step (LinearRegression and RFE come from the import cell).
modelLR = LinearRegression()
selectFeaturesFromRFE = RFE(estimator=modelLR, step=1).fit(features, label)

# Rank 1 marks the features RFE kept; larger ranks were eliminated earlier.
print(selectFeaturesFromRFE.ranking_)


 'atmospheric_temperature(°C)','shaft_temperature(°C)', 'engine_temperature(°C)', 'area_temperature(°C)', 'windmill_body_temperature(°C)', 'rotor_torque(N-m)',
'blade_breadth(m)', 'turbine_status_A', 'turbine_status_A2', 'turbine_status_AAA', 'turbine_status_B2', 'turbine_status_BB', 'turbine_status_BCB', 'cloud_level_Extremely Low','cloud_level_Low', 'cloud_level_Medium', 'month',  'year'

In [None]:
# Model-based selection: SelectFromModel fits the linear regression and keeps
# the features it considers important (both classes come from the import cell).
modelLR = LinearRegression()
selectFeaturesFromSFM = SelectFromModel(modelLR).fit(features, label)

# Boolean mask over the feature columns: True means the feature was selected.
print(selectFeaturesFromSFM.get_support())


'atmospheric_temperature(°C)','engine_temperature(°C)', 'blade_breadth(m)', 'cloud_level_Extremely Low', 'cloud_level_Low', 'cloud_level_Medium', 'year'

In [None]:
# Hand-picked feature subset used for the "feature engineered" experiment.
selectedColumnNames = [
    'atmospheric_temperature(°C)',
    'engine_temperature(°C)',
    'generator_temperature(°C)',
    'blade_breadth(m)',
    'rotor_torque(N-m)',
    'shaft_temperature(°C)',
    'windmill_body_temperature(°C)',
    'cloud_level_Extremely Low',
    'cloud_level_Low',
    'cloud_level_Medium',
]
selectedFeatures = newDataFrame.loc[:, selectedColumnNames]

### **APPLYING STANDARD SCALER**

In [None]:
# The hand-picked subset has its own column set, so it gets its own scaler.
selectedFeatures = StandardScaler().fit_transform(selectedFeatures)

# Learn mean/std from the full TRAINING features ...
standardScaler = StandardScaler()
features = standardScaler.fit_transform(features)
# ... and apply exactly those statistics to the test features. Calling
# fit_transform here would rescale test with its own mean/std, so the model
# would see test data on a different scale than the data it was trained on.
featuresTest = standardScaler.transform(featuresTest)

### **Building the Model**

#### **Train Test Split with Feature Engineered Data**

In [None]:
# 80/20 train/validation split of the hand-picked feature subset.
# NOTE(review): x_train / x_test / y_train / y_test are assigned again by the
# whole-data split further down, so this split is overwritten before any model
# in the visible cells uses it.
x_train,x_test,y_train,y_test = train_test_split(selectedFeatures,label,train_size=0.8,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

#### **Train Test Split with whole data**

##### **Finding the optimum random state**

In [None]:
# Fit a default RandomForestRegressor on nine different 80/20 splits (seeds 1-9)
# and print the held-out and training score for each seed.
# NOTE(review): the forest itself is not seeded, so the scores are not exactly
# reproducible between runs.
for i in range(1,10):
  X_train,X_test,y_train,y_test = train_test_split(features, label, test_size=0.2, random_state = i)
  model1 = RandomForestRegressor()
  model1.fit(X_train,y_train)
  
  train_score = model1.score(X_train,y_train)
  test_score = model1.score(X_test,y_test)
  #if (test_score > 0.95):
  print("Test: {} , Train: {} , RS : {}".format(test_score,train_score,i))


In [None]:
# Final 80/20 split of the full scaled feature matrix, using split seed 2.
x_train,x_test,y_train,y_test = train_test_split(features,label,train_size=0.8,random_state=2)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

#### **Building model**

In [None]:
def ModelTypes():
  """Return the candidate regressors as a list of (name, pipeline) tuples.

  Every estimator is wrapped in a pipeline that applies a StandardScaler
  before the model. An SVR candidate is deliberately left out.
  """
  def scaled(estimator):
    # Same wrapping for every candidate: scale first, then fit the estimator.
    return make_pipeline(StandardScaler(), estimator)

  candidates = [
    ('LinearRegression', LinearRegression()),
    ('Lasso', Lasso()),
    ('Ridge', Ridge(alpha=1.0)),
    ('ElasticNet', ElasticNet()),
    ('KNN', KNeighborsRegressor(n_neighbors=2)),
    ('ExtraTreesRegressor', ExtraTreesRegressor(n_jobs=-1, min_samples_leaf=1, max_depth=20, min_samples_split=3, n_estimators=1000)),
    ('DecisionTree', DecisionTreeRegressor()),
    ('RandomForest', RandomForestRegressor(n_jobs=-1)),
    ('XGBRF', XGBRFRegressor(n_jobs=-1, silent=True)),
    ('GradientBoostingRegressor', GradientBoostingRegressor(criterion='mse',random_state=2,max_depth=5,n_estimators=500,min_samples_split=2,min_samples_leaf=2)),
  ]

  return [(modelName, scaled(estimator)) for modelName, estimator in candidates]

In [None]:
def ModelBuilding(X_train, y_train, models):
  """Cross-validate every candidate model and print its mean/std score.

  Parameters
  ----------
  X_train : array-like
      Training feature matrix.
  y_train : array-like
      Training target values.
  models : iterable of (str, estimator)
      Candidate models as (name, estimator) pairs.

  Returns
  -------
  dict
      Maps each model name to the array of its per-fold scores. The scores are
      negative mean squared errors, so values closer to zero are better.
  """
  num_folds = 10
  scoring = 'neg_mean_squared_error'
  SEED = 2
  # shuffle=True is required for random_state to take effect: scikit-learn
  # raises a ValueError for a seeded KFold that does not shuffle. With an
  # integer seed every model is evaluated on the same folds, so the splitter is
  # built once instead of once per model.
  kfold = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)
  modelScoreDict = {}
  for name, model in models:
    # Pass the configured metric; it used to be defined but never applied.
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    modelScoreDict[name] = cv_results
    scores = "{}: {} ({})" .format(name, cv_results.mean(), cv_results.std())
    print(scores)

  return modelScoreDict

In [None]:
# Build the candidate list and cross-validate each candidate on the training split.
models = ModelTypes()
modelScoreDict = ModelBuilding(x_train, y_train, models)

In [None]:
# Base learners of the final ensemble, both with default hyper-parameters.
# NOTE(review): neither estimator is seeded, so results vary between runs.
gbr = GradientBoostingRegressor()
rf = RandomForestRegressor(n_jobs=-1)

In [None]:
%%time

# Final model: a voting ensemble of the gradient-boosting and random-forest
# regressors, fitted on ALL training rows (no hold-out split).
modelFinal = VotingRegressor([('gbr', gbr),('rf',rf)],n_jobs=-1)
modelFinal.fit(features, label)

y_test_pre = modelFinal.predict(featuresTest)
y_train_pre = modelFinal.predict(features)
# Both metrics are computed on the same rows the model was fitted on, so they
# measure fit to the training data rather than generalisation.
r2_train = r2_score(label, y_train_pre)
rmse_train  = np.sqrt(mean_squared_error(label, y_train_pre))
print("-----Training Data Evalution-----")
print("R2 Value: ", r2_train)
print("RMSE: ", rmse_train)

### **Predicting the Value from Test Data**

In [None]:
# Predict the target for every row of the prepared test feature matrix.
predictedValue = modelFinal.predict(featuresTest)

In [None]:
# Sanity check: number of predictions, followed by the prediction array itself.
print("The length of the predicted vlue is: {}".format(len(predictedValue)), "\n")
print(predictedValue)

In [None]:
# Submission frame: id and timestamp taken from the raw test data, plus the
# predictions. Predictions are assigned positionally, which relies on no test
# rows having been dropped or reordered during preprocessing.
finalDataFrame = test.loc[:,['tracking_id','datetime']]
finalDataFrame['windmill_generated_power(kW/h)'] = predictedValue 

In [None]:
# Display the submission frame for a visual check.
finalDataFrame

In [None]:
# Write the submission CSV (with header, without the index column) to the mounted Drive folder.
finalDataFrame.to_csv('/content/drive/My Drive/HackersEarth/predictionDataMixed.csv', header=True, index=False)