#### Environment Setup

In [40]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb
import seaborn as sns

#### Loading Data

In [41]:
# Read the combined table from the author's local project folder.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not load the data on any other machine until it is pointed at a shared location.
data = pd.read_csv(
    'C:/Users/Manan Arora/Desktop/AgriTech Project/Data Driven Business Metrics Project/FinalSubDistData/CoC-AllCombined_New.csv'
)

# Target variable: row-wise sum of the three input-cost columns.
chem_inputs = ['ferttotal_rs', 'manure_rs', 'insecticide_rs']
data['chem_cost'] = data[chem_inputs].sum(axis=1)

# Dropping irrelevant columns: every column whose name contains '_rs' is removed.
# 'chem_cost' does not match the pattern, so the freshly built target is kept.
rs_columns = list(data.filter(regex='_rs'))
data_no_rs = data[data.columns.drop(rs_columns)]

#### Modelling on Paddy Data

In [50]:
# Keep only the rows whose crop is 'Paddy'.
# `.copy()` makes the subset an independent frame, so later modifications of
# `data_cott` cannot raise pandas' SettingWithCopyWarning or touch `data_no_rs`.
# NOTE(review): the variable is named `data_cott` although it holds Paddy rows;
# the name is kept because the following cells refer to it.
data_cott = data_no_rs.loc[data_no_rs['crop'] == 'Paddy'].copy()
print('Number of Entries: {}'.format(len(data_cott)))

Number of Entries: 138096


#### Preparing for Modelling

In [51]:
# Variable choices: the 19 predictor columns, then the single target column.
cols_indep = [
    # columns with the '_D' suffix
    'N_D', 'OC_D', 'P_D', 'K_D', 'Zn_D', 'Fe_D', 'Cu_D', 'Mn_D', 'B_D',
    # columns with the '%' suffix
    'AS%', 'SrAc%', 'HAc%', 'MAc%', 'SlAc%', 'N%', 'MAl%', 'SlAl%',
    # remaining two predictors
    'Temp', 'Rain',
]
cols_dep = ['chem_cost']

In [52]:
#data_cott.loc[:,cols_indep] = data_cott.loc[:, cols_indep].div(data_cott['croparea_ha'], axis=0)
#data_cott = data_cott.dropna()

In [53]:
#sns.pairplot(data_cott,x_vars = cols_indep ,y_vars= cols_dep ,kind='reg')

In [54]:
# Treat +/-inf as missing, then drop every row that contains a missing value.
# The replacement is chained instead of using `inplace=True`: the cell now builds
# a new frame rather than mutating one that may be a slice of `data_no_rs`, which
# is what triggered the SettingWithCopyWarning.
data_cott = data_cott.replace([np.inf, -np.inf], np.nan).dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [55]:
# Sanity check: every predictor value must be finite before splitting/scaling.
# The check reads `data_cott` directly because the scaled matrix `X_train_sc`
# is only created in the following cell; referencing it here raises NameError
# when the notebook is run top to bottom on a fresh kernel.
np.isfinite(data_cott[cols_indep].to_numpy(dtype=float)).all()

True

In [56]:
# Feature matrix and target vector taken from the cleaned subset.
X = data_cott.loc[:, cols_indep]
y = data_cott.loc[:, 'chem_cost']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features with statistics learned from the training rows only,
# then apply the same transform to the test rows.
sc = StandardScaler()
X_train_sc = sc.fit(X_train).transform(X_train)
X_test_sc = sc.transform(X_test)

In [57]:
# Linear Regression
# Ordinary least squares on the standardised features. The `normalize` argument
# is not passed: it was deprecated in scikit-learn 1.0 and removed in 1.2 (where
# it raises TypeError), and the inputs are already scaled by StandardScaler.
lm = LinearRegression()

model = lm.fit(X_train_sc, y_train)

predictions_train = lm.predict(X_train_sc)
predictions = lm.predict(X_test_sc)

# Predicted vs. actual target for the held-out rows.
df = pd.DataFrame(list(zip(predictions, y_test)), columns=['Pred', 'y'])
print(df)

# `score` is the model's R^2. The `accuracy_*` variables hold mean squared
# error; its square root (RMSE) is what gets printed below.
score_train = lm.score(X_train_sc, y_train)
accuracy_train = mean_squared_error(y_train, predictions_train)
score_test = lm.score(X_test_sc, y_test)
accuracy_test = mean_squared_error(y_test, predictions)
print('Linear_Regression Score:', score_train, np.sqrt(accuracy_train), score_test, np.sqrt(accuracy_test))

# One coefficient per predictor. `cols_indep` already ends with 'Temp' and
# 'Rain', so the names are zipped as-is rather than appending them a second time.
weights = pd.DataFrame(list(zip(cols_indep, lm.coef_)), columns=['Feature', 'Coefficient'])
print(weights)

               Pred        y
0       6906.700554   7992.0
1       4828.505331    300.0
2       4149.596638   4610.0
3       -397.604191    979.0
4       4004.950373      0.0
...             ...      ...
33400   7679.030071   3400.0
33401   5279.407132   7190.0
33402   4228.111151   4336.0
33403  10117.389513   7410.0
33404   5483.384972  16380.0

[33405 rows x 2 columns]
Linear_Regression Score: 0.12248546541676751 6040.478994145059 0.12104646242736994 5889.031574871823
   Feature    Coefficient
0      N_D      40.130367
1     OC_D     670.414289
2      P_D    -771.839050
3      K_D    -339.863475
4     Zn_D    -392.563923
5     Fe_D    -192.460529
6     Cu_D     -72.662870
7     Mn_D     187.652413
8      B_D    -278.437082
9      AS%  -59804.435435
10   SrAc% -126129.592847
11    HAc% -332900.824605
12    MAc% -362138.066220
13   SlAc% -247933.664463
14      N% -360576.230424
15    MAl% -583131.436789
16   SlAl%  -70515.357662
17    Temp    -332.903737
18    Rain    -274.425415


In [285]:
# ElasticNet
# Regularised linear model fitted on the same standardised split.
# NOTE(review): this cell reuses `lm`, `model`, `predictions` and `df`, so the
# plain linear-regression objects from the previous cell are overwritten.
# NOTE(review): the output recorded under this cell shows targets of roughly
# 1-3, unlike the `chem_cost` values printed by the previous cell; it appears
# to come from an earlier run with a different target - re-run to refresh it.
lm = ElasticNet(alpha=0.1)

model = lm.fit(X_train_sc, y_train)

predictions_train = lm.predict(X_train_sc)
predictions = lm.predict(X_test_sc)

# First ten held-out rows: prediction next to the true target.
df = pd.DataFrame(list(zip(predictions, y_test)), columns=['Pred', 'y'])
print(df[0:10])

# R^2 plus RMSE (square root of the mean squared error) for both splits.
score_train = lm.score(X_train_sc, y_train)
accuracy_train = mean_squared_error(y_train, predictions_train)
score_test = lm.score(X_test_sc, y_test)
accuracy_test = mean_squared_error(y_test, predictions)
# The label names the model actually fitted in this cell.
print('ElasticNet Score:', score_train, np.sqrt(accuracy_train), score_test, np.sqrt(accuracy_test))
print('Coefficients', lm.coef_)

       Pred         y
0  1.108259  1.224770
1  1.192967  2.140992
2  1.428625  0.826720
3  1.462508  1.377866
4  1.366543  1.271876
5  1.451688  1.491540
6  1.459401  0.757826
7  1.657615  2.755732
8  1.541169  1.785837
9  1.402421  2.204586
Linear_Regression Score: 0.13918061312370544 0.6415934465491413 0.1632145282263242 0.5723935541833631
Coeficients [-0.          0.         -0.0603792  -0.0518024   0.          0.02179469
 -0.          0.          0.         -0.00466376 -0.         -0.02740893
 -0.         -0.         -0.00060839  0.1100677   0.          0.
  0.        ]


In [None]:
# XGBoost
# Gradient-boosted regressor created with the library's default settings.
xgb_reg = xgb.XGBRegressor()
xgb_model = xgb_reg.fit(X_train_sc, y_train)

predictions_xgb = xgb_model.predict(X_test_sc)

# Pair each test-set prediction with its true target and show the first 20 rows.
pred_vs_true = list(zip(predictions_xgb, y_test))
df_xgb = pd.DataFrame(pred_vs_true, columns=['Pred_xgb', 'y'])
print(df_xgb.head(20))

# Root mean squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test, predictions_xgb))
print(rmse)

In [None]:
# Support vector regression with a linear kernel.
# Results are stored under SVR-specific names so the fitted XGBoost model in
# `xgb_model` (and its predictions) are not overwritten, and the prediction
# column is labelled with the model that produced it.
# NOTE(review): kernel SVR training time grows faster than linearly with the
# number of rows; this cell may be very slow on the full training set.
reg = svm.SVR(kernel='linear', C=0.3, epsilon=0.01)
svr_model = reg.fit(X_train_sc, y_train)
predictions_svr = reg.predict(X_test_sc)

# First 20 held-out rows: prediction next to the true target.
df_svr = pd.DataFrame(list(zip(predictions_svr, y_test)), columns=['Pred_svr', 'y'])
print(df_svr[0:20])

# Root mean squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test, predictions_svr))
print(rmse)

In [None]:
# Random forest
# `RandomForestRegressor` is already imported in the notebook's import cell,
# so it is not re-imported here.
# Instantiate the model with 1000 decision trees. `n_jobs=-1` fits the trees on
# all available cores; with a fixed `random_state` the fitted forest is the same.
rf = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1)

# Stored under forest-specific names so the XGBoost objects (`xgb_model`,
# `predictions_xgb`, `df_xgb`) are not overwritten and the column label is accurate.
rf_model = rf.fit(X_train_sc, y_train)

predictions_rf = rf.predict(X_test_sc)

# First 20 held-out rows: prediction next to the true target.
df_rf = pd.DataFrame(list(zip(predictions_rf, y_test)), columns=['Pred_rf', 'y'])
print(df_rf[0:20])

# Root mean squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test, predictions_rf))
print(rmse)

In [None]:
# Decision tree
# `DecisionTreeRegressor` is already imported in the notebook's import cell,
# so it is not re-imported here.
dt = DecisionTreeRegressor(random_state=0)

# Stored under tree-specific names so the XGBoost objects (`xgb_model`,
# `predictions_xgb`, `df_xgb`) are not overwritten and the column label is accurate.
dt_model = dt.fit(X_train_sc, y_train)

predictions_dt = dt.predict(X_test_sc)

# First 20 held-out rows: prediction next to the true target.
df_dt = pd.DataFrame(list(zip(predictions_dt, y_test)), columns=['Pred_dt', 'y'])
print(df_dt[0:20])

# Root mean squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test, predictions_dt))
print(rmse)