In [172]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

Data Ingestions

In [173]:
path = r'../data/intrim/LiteratureData_20220809.csv'
df_org = pd.read_csv(path, encoding= 'unicode_escape')
df = df_org.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 44 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sr. No.         558 non-null    float64
 1   Year            558 non-null    float64
 2   Author          558 non-null    object 
 3   Exp             558 non-null    float64
 4   Data            558 non-null    float64
 5   Vel             558 non-null    float64
 6   Temp            560 non-null    float64
 7   RH              558 non-null    float64
 8   hours           560 non-null    float64
 9   Fit             558 non-null    float64
 10  Variety         558 non-null    object 
 11  Technique       558 non-null    object 
 12  Pretreatment    558 non-null    object 
 13  P_temp          558 non-null    object 
 14  P_time          558 non-null    object 
 15  kg_r            9 non-null      object 
 16  kg_m            560 non-null    float64
 17  Diff_r          0 non-null      flo

In [174]:
df.columns

Index(['Sr. No.', 'Year', 'Author', 'Exp', 'Data', 'Vel', 'Temp', 'RH',
       'hours', 'Fit', 'Variety', 'Technique', 'Pretreatment', 'P_temp',
       'P_time', 'kg_r', 'kg_m', 'Diff_r', 'Diff_m', 'Do', 'TD', 'alpha',
       'aLR', 'aRL', 'mwR', 'Density', 'Berry Count', 'Radius', 'Dry_Mass',
       'Weight_i', 'Vol_i', 'Water_i', 'MR_i', 'MC_i', 'MC_i.1', 'Weight_f',
       'Vol_f', 'Water_f', 'MC_eq_Lit', 'MC_eq_Lit.1', 'MR_f', 'MC_f',
       'MC_f.1', 'Pretreatment.1'],
      dtype='object')

In [208]:
features_categorical = ['Variety', 'Technique', 'Pretreatment',] 
features_numerical = ['Vel', 'Temp', 'P_temp', 'P_time','RH',]
target = ['hours']

In [209]:
df = df[features_categorical + features_numerical + target]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 0 to 557
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Variety       558 non-null    object 
 1   Technique     558 non-null    object 
 2   Pretreatment  558 non-null    object 
 3   Vel           558 non-null    float64
 4   Temp          558 non-null    float64
 5   P_temp        558 non-null    float64
 6   P_time        558 non-null    float64
 7   RH            558 non-null    float64
 8   hours         558 non-null    float64
dtypes: float64(6), object(3)
memory usage: 59.8+ KB


In [220]:
df['P_temp'].replace(to_replace=dict(Untreated=25.001, NotApplicable = 25.001), inplace=True)
df['P_time'].replace(to_replace=dict(Untreated=0.0001, NotApplicable = 25.001), inplace=True)
df['P_temp'] = df['P_temp'].astype(float)
df['P_time'] = df['P_time'].astype(float)
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 0 to 557
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Variety       558 non-null    object 
 1   Technique     558 non-null    object 
 2   Pretreatment  558 non-null    object 
 3   Vel           558 non-null    float64
 4   Temp          558 non-null    float64
 5   P_temp        558 non-null    float64
 6   P_time        558 non-null    float64
 7   RH            558 non-null    float64
 8   hours         558 non-null    float64
dtypes: float64(6), object(3)
memory usage: 59.8+ KB


In [222]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[features_categorical + features_numerical])
X = enc.transform(df[features_categorical + features_numerical])
y = df[target].dropna()

In [114]:
X = pd.get_dummies(data=df[features_categorical + features_numerical ],
                   columns=features_categorical).dropna()


Trian Test Split

In [223]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=0)

Base Model

In [224]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
reg_lin = model.fit(X_train, y_train)
y_pred = reg_lin.predict(X_test)

from sklearn.metrics import r2_score
print("Training accuracy: {:.4f}".format(r2_score(y_train, reg_lin.predict(X_train))))
print("Testing accuracy:  {:.4f}".format(r2_score(y_test , reg_lin.predict(X_test))))


Training accuracy: 0.9345
Testing accuracy:  0.6651


Random forrest 

In [225]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=10,
                              n_estimators=10,
                              criterion='absolute_error')
reg_rf = model.fit(X_train, y_train)
y_pred = reg_rf.predict(X_test)

from sklearn.metrics import r2_score
print("Training accuracy: {:.4f}".format(r2_score(y_train, reg_rf.predict(X_train))))
print("Testing accuracy:  {:.4f}".format(r2_score(y_test , reg_rf.predict(X_test))))

Training accuracy: 0.8558
Testing accuracy:  0.7761


Testing on Payload

In [119]:
df_check = df_org[500:501]
out = df_check.to_json(orient='records')[1:-1].replace('},{', '} {')
with open('../tests/payload.json', 'w') as f:
    f.write(out)

In [226]:
payload = {
      "Sr. No.": 501,
      "Year": 2017,
      "Author": "Azzouz S.",
      "Exp": 1,
      "Data": 18,
      "Vel": 2,
      "Temp": 45,
      "RH": 20,
      
      "Fit": 0.142,
      "Variety": "Thomson",
      "Technique": "Convective",
      "Pretreatment": "Untreated",
      "P_temp": "Untreated",
      "P_time": "Untreated",
      "kg_r": 'null',
      "kg_m": 69,
      "Diff_r": 'null',
      "Diff_m": 0.000306,
      "Do": 750000000,
      "TD": 2483,
      "alpha": 0.0179,
      "aLR": 0.023,
      "aRL": 0.149,
      "mwR": "29.6",
      "Density": 1075,
      "Berry Count": 400,
      "Radius": 0.822,
      "Dry_Mass": 205.778,
      "Weight_i": 1000,
      "Vol_i": 930.233,
      "Water_i": 794.222,
      "MR_i": 1,
      "MC_i": 3.8596,
      "MC_i.1": 0.7942,
      "Weight_f": 289.098,
      "Vol_f": 224.107,
      "Water_f": 83.32,
      "MC_eq_Lit": 0.4049,
      "MC_eq_Lit.1": 0.2882,
      "MR_f": 0,
      "MC_f": 0.4049,
      "MC_f.1": 0.2882,
      "Pretreatment.1": "Untreated"
}

#"hours": 17,

Prediction function

In [253]:
def prediction(data, encoder, model):

    df = pd.DataFrame(data, index=[0])

    if isinstance(df['P_temp'][0], str):
        df['P_temp'][0] = 25
    if isinstance(df['P_time'][0], str):
        df['P_time'][0] = 0.00  
    df['P_temp'] = df['P_temp'].astype(float)
    df['P_time'] = df['P_time'].astype(float) 
          
    X = df[features_categorical + features_numerical]
    X_en = enc.transform(X)
    y_pred = model.predict(X_en)
    return y_pred

Dumping Model and Encoder

In [255]:
import pickle

with open('../bin/reg_rf.bin', 'wb') as f_out:
    pickle.dump( reg_rf , f_out)

with open('../bin/enc.bin', 'wb') as f_out:
    pickle.dump( enc , f_out)