In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

Data Ingestions

In [47]:
path = r'../data/intrim/LiteratureData_20220809.csv'
df_org = pd.read_csv(path, encoding= 'unicode_escape')
df = df_org.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 44 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sr. No.         558 non-null    float64
 1   Year            558 non-null    float64
 2   Author          558 non-null    object 
 3   Exp             558 non-null    float64
 4   Data            558 non-null    float64
 5   Vel             558 non-null    float64
 6   Temp            560 non-null    float64
 7   RH              558 non-null    float64
 8   hours           560 non-null    float64
 9   Fit             558 non-null    float64
 10  Variety         558 non-null    object 
 11  Technique       558 non-null    object 
 12  Pretreatment    558 non-null    object 
 13  P_temp          558 non-null    object 
 14  P_time          558 non-null    object 
 15  kg_r            9 non-null      object 
 16  kg_m            560 non-null    float64
 17  Diff_r          0 non-null      flo

In [49]:
df.columns

Index(['Sr. No.', 'Year', 'Author', 'Exp', 'Data', 'Vel', 'Temp', 'RH',
       'hours', 'Fit', 'Variety', 'Technique', 'Pretreatment', 'P_temp',
       'P_time', 'kg_r', 'kg_m', 'Diff_r', 'Diff_m', 'Do', 'TD', 'alpha',
       'aLR', 'aRL', 'mwR', 'Density', 'Berry Count', 'Radius', 'Dry_Mass',
       'Weight_i', 'Vol_i', 'Water_i', 'MR_i', 'MC_i', 'MC_i.1', 'Weight_f',
       'Vol_f', 'Water_f', 'MC_eq_Lit', 'MC_eq_Lit.1', 'MR_f', 'MC_f',
       'MC_f.1', 'Pretreatment.1'],
      dtype='object')

In [85]:
features_categorical = ['Variety', 'Technique', 'Pretreatment',] 
features_numerical = ['Vel', 'Temp', 'P_temp', 'P_time',] #'RH',]
target = ['hours']

In [86]:
df = df[features_categorical + features_numerical + target]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 0 to 557
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Variety       558 non-null    object 
 1   Technique     558 non-null    object 
 2   Pretreatment  558 non-null    object 
 3   Vel           558 non-null    float64
 4   Temp          558 non-null    float64
 5   P_temp        558 non-null    float64
 6   P_time        558 non-null    float64
 7   hours         558 non-null    float64
dtypes: float64(5), object(3)
memory usage: 39.2+ KB


In [87]:
df['P_temp'].replace(to_replace=dict(Untreated=25.001, NotApplicable = 25.001), inplace=True)
df['P_time'].replace(to_replace=dict(Untreated=0.0001, NotApplicable = 25.001), inplace=True)
df['P_temp'] = df['P_temp'].astype(float)
df['P_time'] = df['P_time'].astype(float)
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 0 to 557
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Variety       558 non-null    object 
 1   Technique     558 non-null    object 
 2   Pretreatment  558 non-null    object 
 3   Vel           558 non-null    float64
 4   Temp          558 non-null    float64
 5   P_temp        558 non-null    float64
 6   P_time        558 non-null    float64
 7   hours         558 non-null    float64
dtypes: float64(5), object(3)
memory usage: 39.2+ KB


In [88]:
X = pd.get_dummies(data=df[features_categorical + features_numerical ],
                   columns=features_categorical).dropna()
y = df[target].dropna()

In [89]:
X.columns

Index(['Vel', 'Temp', 'P_temp', 'P_time', 'Variety_Aledo', 'Variety_Asgari',
       'Variety_Black', 'Variety_Centennial', 'Variety_Chasselas',
       'Variety_Crimson', 'Variety_Delight', 'Variety_Emerald',
       'Variety_Flame', 'Variety_Globe', 'Variety_Italia', 'Variety_Monukka',
       'Variety_Monukka ', 'Variety_Muscatel', 'Variety_Nevado',
       'Variety_Perlette', 'Variety_Red', 'Variety_Rodi', 'Variety_Ruby',
       'Variety_Seeded', 'Variety_Sugraone', 'Variety_Tempranillo',
       'Variety_Thomson', 'Variety_Tunisian', 'Variety_Unreported',
       'Technique_ConMic', 'Technique_Convective', 'Technique_Dark',
       'Technique_Fluidized bed', 'Technique_MicroWave', 'Technique_Open Sun',
       'Technique_Shade', 'Pretreatment_Chemical',
       'Pretreatment_Chemical + Microwave', 'Pretreatment_Cryogenic',
       'Pretreatment_Electric', 'Pretreatment_Freezing',
       'Pretreatment_Hot Air', 'Pretreatment_Microwave', 'Pretreatment_Ohmic',
       'Pretreatment_Physical', 'P

Trian Test Split

In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=0)

Base Model

In [94]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
reg_lin = model.fit(X_train, y_train)
y_pred = reg_lin.predict(X_test)

from sklearn.metrics import r2_score
print("Training accuracy: {:.4f}".format(r2_score(y_train, reg_lin.predict(X_train))))
print("Testing accuracy:  {:.4f}".format(r2_score(y_test , reg_lin.predict(X_test))))


Training accuracy: 0.7472
Testing accuracy:  0.2733


Random forrest 

In [104]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=10,
                              n_estimators=10,
                              criterion='absolute_error')
reg_rf = model.fit(X_train, y_train)
y_pred = reg_rf.predict(X_test)

from sklearn.metrics import r2_score
print("Training accuracy: {:.4f}".format(r2_score(y_train, reg_rf.predict(X_train))))
print("Testing accuracy:  {:.4f}".format(r2_score(y_test , reg_rf.predict(X_test))))

Training accuracy: 0.8986
Testing accuracy:  0.7224
