**Importing the necessary libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error , r2_score

**Reading data**

In [None]:
# Load the semicolon-delimited household power consumption export.
HPC = pd.read_csv(
    '02 Household Power Consumption.txt',
    sep=';',
)

# Display (rows, columns); the recorded run showed (185711, 9).
HPC.shape



(185711, 9)

**Checking first 5 rows**

In [None]:
# Preview the first five rows of the raw frame.
HPC.head(n=5)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


**Cleaning data**


In [None]:
  
    
# Normalise the text columns, mark missing readings as NaN, and cast the
# measurement columns to float so they can be imputed numerically.

# Frame containing only the string-typed (object) columns.
df_obj = HPC.select_dtypes(['object'])

# Strip surrounding whitespace so padded markers are matched by the replace below.
HPC[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# '' and '?' are treated as missing-value markers and become NaN.
HPC = HPC.replace(dict.fromkeys(['','?'], np.nan))

# Cast every measurement column (everything except Date/Time) to float.
# `astype` returns a new object, so the result must be assigned back; the
# previous loop discarded it and left these columns as strings.
# Selecting by name rather than by position keeps the cell valid even if the
# Date/Time columns have already been dropped.
measurement_cols = [c for c in HPC.columns if c not in ('Date', 'Time')]
HPC[measurement_cols] = HPC[measurement_cols].astype(float)


In [None]:
# Show the columns that were originally read as strings, after cleaning.
HPC.loc[:, df_obj.columns]

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2
0,16/12/2006,17:24:00,4.216,0.418,234.840,18.400,0.000,1.000
1,16/12/2006,17:25:00,5.360,0.436,233.630,23.000,0.000,1.000
2,16/12/2006,17:26:00,5.374,0.498,233.290,23.000,0.000,2.000
3,16/12/2006,17:27:00,5.388,0.502,233.740,23.000,0.000,1.000
4,16/12/2006,17:28:00,3.666,0.528,235.680,15.800,0.000,1.000
...,...,...,...,...,...,...,...,...
185706,24/4/2007,16:30:00,0.320,0.082,235.190,1.600,0.000,2.000
185707,24/4/2007,16:31:00,0.328,0.078,234.530,1.600,0.000,1.000
185708,24/4/2007,16:32:00,0.344,0.088,234.900,1.800,0.000,1.000
185709,24/4/2007,16:33:00,0.416,0.174,235.290,2.000,0.000,1.000


**Check the number of NaNs and impute them with the column mean**

In [None]:
# Total number of missing cells before imputation.
print(HPC.isnull().sum().sum())

# Date/Time are dropped because the model does not use the time series.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the two columns are already gone.
HPC = HPC.drop(columns=['Date','Time'], errors='ignore')

# Replace every remaining NaN with the mean of its column.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns a bare ndarray; pass the labels back so the frame keeps
# its column names and index instead of falling back to 0..n-1.
HPC = pd.DataFrame(imputer.fit_transform(HPC), columns=HPC.columns, index=HPC.index)

# Sanity check: this should print 0.
print(HPC.isnull().sum().sum())




0


KeyError: ignored

**Using polynomial features**

In [None]:
# Expand every column except the first into degree-2 polynomial terms and wrap
# the resulting array in a DataFrame.
poly_reg = PolynomialFeatures(degree=2)
HPC_Data = pd.DataFrame(poly_reg.fit_transform(HPC.iloc[:, 1:]))
print(HPC_Data.shape)
HPC_Data.head(n=4)

(185711, 28)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1.0,0.418,234.84,18.4,0.0,1.0,17.0,0.174724,98.16312,7.6912,...,338.56,0.0,18.4,312.8,0.0,0.0,0.0,1.0,17.0,289.0
1,1.0,0.436,233.63,23.0,0.0,1.0,16.0,0.190096,101.86268,10.028,...,529.0,0.0,23.0,368.0,0.0,0.0,0.0,1.0,16.0,256.0
2,1.0,0.498,233.29,23.0,0.0,2.0,17.0,0.248004,116.17842,11.454,...,529.0,0.0,46.0,391.0,0.0,0.0,0.0,4.0,34.0,289.0
3,1.0,0.502,233.74,23.0,0.0,1.0,17.0,0.252004,117.33748,11.546,...,529.0,0.0,23.0,391.0,0.0,0.0,0.0,1.0,17.0,289.0


**Correlation**

In [None]:
# Pairwise Pearson correlation between the expanded feature columns.
# NOTE(review): column 0 looks constant (all 1.0 in the preview), which would
# explain its all-NaN row/column — confirm.
HPC_Data.corr(method='pearson')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,,,,,,,,,,,...,,,,,,,,,,
1,,1.0,-0.155888,0.290593,0.133704,0.181552,0.104612,0.897268,0.999756,0.723293,...,0.259655,0.144139,0.155087,0.208146,0.12268,0.087662,0.103004,0.114654,0.143115,0.100593
2,,-0.155888,1.0,-0.580983,-0.24382,-0.228056,-0.387751,-0.166372,-0.139684,-0.415117,...,-0.531633,-0.261509,-0.253474,-0.511183,-0.224904,-0.139746,-0.231057,-0.202296,-0.228702,-0.367976
3,,0.290593,-0.580983,1.0,0.437476,0.460201,0.575285,0.304257,0.280215,0.691892,...,0.920553,0.464852,0.482129,0.811789,0.412152,0.26004,0.399933,0.411635,0.441231,0.562844
4,,0.133704,-0.24382,0.437476,1.0,0.069794,0.120963,0.155686,0.128296,0.365827,...,0.504809,0.957695,0.110493,0.321459,0.976142,0.351137,0.826977,0.053545,0.062068,0.113917
5,,0.181552,-0.228056,0.460201,0.069794,1.0,0.121031,0.176463,0.176975,0.396623,...,0.537495,0.110842,0.942436,0.342398,0.058675,0.308518,0.058626,0.928828,0.828349,0.114588
6,,0.104612,-0.387751,0.575285,0.120963,0.121031,1.0,0.11445,0.09871,0.327151,...,0.405977,0.126672,0.135826,0.832335,0.104796,0.044159,0.197625,0.102257,0.230657,0.998693
7,,0.897268,-0.166372,0.304257,0.155686,0.176463,0.11445,1.0,0.894291,0.781277,...,0.287421,0.169019,0.162542,0.223932,0.142782,0.106521,0.115253,0.115487,0.144645,0.110268
8,,0.999756,-0.139684,0.280215,0.128296,0.176975,0.09871,0.894291,1.0,0.712685,...,0.248797,0.137932,0.149432,0.19897,0.11766,0.083669,0.098,0.11051,0.138272,0.09506
9,,0.723293,-0.415117,0.691892,0.365827,0.396623,0.327151,0.781277,0.712685,1.0,...,0.700085,0.410112,0.419029,0.560426,0.345321,0.271628,0.316851,0.328992,0.373138,0.316817


**Define X, y**

In [None]:
# Target: the first column of the imputed frame.
y = HPC.iloc[:, 0]
# Features: every column of the polynomial-expanded frame.
X = HPC_Data.iloc[:, :]

0         4.216
1         5.360
2         5.374
3         5.388
4         3.666
          ...  
185706    0.320
185707    0.328
185708    0.344
185709    0.416
185710    0.408
Name: 0, Length: 185711, dtype: float64

**Standardizing the data with StandardScaler (zero mean, unit variance)**

In [None]:
# Standardise features and target with one scaler each.
# Re-fitting a single StandardScaler on y overwrites the statistics it learned
# from X, so X could no longer be transformed or inverse-transformed with it.
# NOTE(review): both scalers are fitted on the full data set, before any
# train/test split, so test-set statistics influence the scaling — consider
# fitting on the training rows only.
X_Scaler = StandardScaler()
y_Scaler = StandardScaler()

X_scaled = pd.DataFrame(X_Scaler.fit_transform(X))

# StandardScaler expects a 2-D array, so y is reshaped to a single column for
# scaling and flattened back to 1-D to build the Series.
y_scaled = pd.Series(y_Scaler.fit_transform(y.values.reshape(-1, 1)).reshape(-1))

# `Scaler` previously ended up fitted on y; keep the name bound to the target
# scaler so any existing use (e.g. inverse-transforming predictions) still works.
Scaler = y_Scaler

**Splitting data into train and test**

In [None]:
# Hold out 20% of the rows for testing; shuffle with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.20,
    random_state=44,
    shuffle=True,
)

**Chose a linear regression model to predict the Global active power**

In [None]:
# Fit ordinary least squares on the training split; `fit` returns the estimator
# itself, which is displayed as the cell output.
Model = LinearRegression().fit(X_train, y_train)
Model

LinearRegression()

**Some predictions**

In [None]:
# Predict the held-out rows and print the first five actual values followed by
# the first five predictions, one list per line.
y_pred = Model.predict(X_test)
print(list(y_test[:5]), list(y_pred[:5]), sep='\n')

[-0.9020410682003588, -0.6422266720364048, 2.453894882250714, 0.5996242810567799, -0.7504826704380524]
[-0.9287526105967222, -0.6036472751103751, 2.4224171665026963, 0.6754517715090618, -0.7357913899033437]



**Cross-validation scores**




In [None]:
# 3-fold cross-validation scores, computed separately on the training split
# and on the test split.
CrossValidateScoreTrain = cross_val_score(Model, X_train, y_train, cv=3)
CrossValidateScoreTest = cross_val_score(Model, X_test, y_test, cv=3)
print(
    CrossValidateScoreTrain,
    '//////////////// ',
    CrossValidateScoreTest,
    sep='\n',
)

[0.99877114 0.99881087 0.99878909]
//////////////// 
[0.99892647 0.99884227 0.99879404]


**Calculating Mean Squared Error**

In [None]:
# Mean squared error on the test split; 'uniform_average' yields one averaged
# value ('raw_values' is the per-output alternative).
MSEValue = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred,
    multioutput='uniform_average',
)
print('Mean Squared Error Value is : ', MSEValue)

Mean Squared Error Value is :  0.0011234151245614718


**Calculating Accuracy (R² score)**

In [None]:
# The reported "accuracy" is the R² (coefficient of determination) on the test
# split, printed as a percentage with two decimals.
AccScore = r2_score(y_true=y_test, y_pred=y_pred)
print("Accuracy is %.2f" % (AccScore * 100) + '%')

**Model Visualization**

In [None]:
# Predicted-vs-actual plot for the held-out rows.
# The previous version scattered the whole multi-column X_test frame against
# the 1-D y_test (and plotted X_train the same way), which fails because the
# x and y inputs have different sizes. With many features there is no single
# x-axis, so the fit is visualised as prediction against truth instead.
test_predictions = Model.predict(X_test)

fig, ax = plt.subplots()
ax.scatter(y_test, test_predictions, color='green', s=4, alpha=0.3,
           label='Test samples')

# Identity line: points on it are predicted exactly.
lower = min(y_test.min(), test_predictions.min())
upper = max(y_test.max(), test_predictions.max())
ax.plot([lower, upper], [lower, upper], color='blue', label='Perfect prediction')

ax.set_title('Power consumption')
ax.set_xlabel('Actual Global active power (standardized)')
ax.set_ylabel('Predicted Global active power (standardized)')
ax.legend()
plt.show()

KeyError: ignored