## This is a notebook that make the 'machine learning filling missing data' into a function

In [1]:
# import the need package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [14]:
# making function
def filled_missingdata_with_ML(dataframe):
    
    """
    This is a function that using machine learning in 'polynomial' that in SKLEARN to predict the missing data 
    and will return to a dataframe which has the predicted data and with a new column name 'status' with 0 
    representing that this data is an asumption.
    
    Args:
       dataframe(DataFrame):name of the dataframe which has missing data
       
    Returns:
       dataframe
    
    """
    
    # drop the 'DC capacity' & 'Location' columns cause if will influence the 'isnan' judgement.
    dataframe_with_missing = dataframe.drop(['DC Capacity','Location'],axis=1)
    
    # change the 'month' to the number instead of 'words'
    month_change = {'November':11,'December':12,'January': 1,
                'February':2,'March':3,'April':4,'May':5,
                'June':6,'July':7,'August':8,'September':9,'October':10}
    dataframe_with_missing['Month'] = dataframe_with_missing['Month'].map(month_change)
    
    # get the 'date' that has the missing data
    list_of_missing = []
    index_of_missing = []
    for i in range(len(dataframe_with_missing)):
        if np.isnan(dataframe_with_missing['Energy'][i]) == True:
            index_of_missing.append(i)
            year_month = [dataframe_with_missing['Year'][i],dataframe_with_missing['Month'][i]]
            list_of_missing.append(year_month)
            
    #drop the missing data
    dataframe_without_missing = dataframe_with_missing.dropna()
    dataframe_without_missing.reset_index(drop=True,inplace=True)
    
    #creating the x,y to train the model
    x_list=[]
    for i in range(len(dataframe_without_missing)):
        a = [dataframe_without_missing['Year'][i],dataframe_without_missing['Month'][i]]
        x_list.append(a)

    y_list=[]
    for i in range(len(dataframe_without_missing)):
        b = dataframe_without_missing['Energy'][i]
        y_list.append(b)
    
    # preparing the x_need_predicted for the model
    x_need_predict=list_of_missing
    
    # design the model
    poly = PolynomialFeatures(degree=8)
    
    # Transfer the preditor into poly way
    X_train = poly.fit_transform(x_list)
    x_need_predict_transfer = poly.fit_transform(x_need_predict)
    
    # Instantiate
    lg = LinearRegression()
    
    # Fit
    lg.fit(X_train, y_list)
    
    # Predict
    y_predicted = lg.predict(x_need_predict_transfer)
    
    # put the predicted data back (actually create a new dataframe with predicted data)
    list_of_predicted = y_predicted.tolist()
    fill = pd.DataFrame(index=index_of_missing,data=list_of_predicted,columns=['Energy'])
    dataframe_with_predicted = dataframe.fillna(fill)
    
    # create a new column in dataframe name 'status'
    dataframe_with_predicted['Status'] = ""
    for i in index_of_missing:
        dataframe_with_predicted['Status'][i] = 0
        
    return(dataframe_with_predicted)

In [15]:
# read the missing file
bethel = pd.read_excel('../Cleaned_data/RuralAk/RuralAK_Bethel_cleaned.xlsx')

In [19]:
a = filled_missingdata_with_ML(bethel)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
a # It is working!!!!!

Unnamed: 0,Year,Month,Energy,DC Capacity,Location,Status
0,2012,October,950.000000,10.0,Bethel,
1,2012,November,739.000000,,,
2,2012,December,370.000000,,,
3,2013,January,329.000000,,,
4,2013,February,730.000000,,,
5,2013,March,1575.000000,,,
6,2013,April,2374.000000,,,
7,2013,May,1902.000000,,,
8,2013,June,1481.000000,,,
9,2013,July,1127.000000,,,
