<a href="https://colab.research.google.com/github/anuppapu/Data_Science/blob/master/Estimation_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Data preparation
#
# Loads the work-request export, keeps only 'Client Conversion' projects,
# derives a client tier, removes duplicates, fills missing estimates with 0
# and encodes the two categorical columns ('tier', 'Parent Product') as
# numbers. The resulting `df` is what the modelling cell consumes.
# ---------------------------------------------------------------------------

# Load the data frame with specific columns
specific_columns = ['Total Estimate', 'Parent Product', 'Admin Estimate', 'BA Estimate', 'Dev Estimate',
                     'ProductSupport Estimate', 'QA Estimate', 'Client', 'Project Category']
df = pd.read_csv('/content/Work_Request.csv', usecols=specific_columns, encoding='latin1')

# .copy() detaches the filtered rows from the full frame, so the column
# assignments below act on this frame itself rather than on a possible view
# (the warning pandas would emit for that is silenced at the top of the cell).
df = df[df['Project Category'] == 'Client Conversion'].copy()


# Client name -> tier label
mapping = {'003 - UBS Wealth Management USA': 'tier 1',
           '330 - TD Prime Services': 'tier 2',
           '504 - JP Morgan Chase': 'tier 1',
           '108 - Cowen & Company': 'tier 3',
           '101 - National Bank of Canada': 'tier 3',
           '079 - Santander Investment Securities': 'tier 3',
           '073 - Industrial and Commercial Bank of China (ICBC)': 'tier 1',
           '335 - Bank of Montreal, Security TRA': 'tier 3',
           '084 - BMO Capital Markets Corp.': 'tier 2',
           '303 - HSBC Bank USA, N.A.': 'tier 2',
           '006 - RBC Capital Market Corp': 'tier 2',
           '027 - J.P. Morgan Clearing Corp.': 'tier 1'
           }

# Map the client column values to tier 1, tier 2, and tier 3
df['tier'] = df['Client'].map(mapping)

# A client that is absent from `mapping` gets a missing tier. Previously the
# blanket fillna(0) turned that into the integer 0 next to string labels,
# which LabelEncoder rejects with a hard-to-read mixed-type error. Stop here
# instead and say exactly which clients need a mapping entry.
unmapped_clients = df.loc[df['tier'].isna(), 'Client'].unique()
if len(unmapped_clients) > 0:
    raise ValueError(
        'Clients missing from the tier mapping: {}'.format(list(unmapped_clients)))

# 'Project Category' is constant after the filter and 'Client' is now
# represented by 'tier', so neither is needed as a feature.
df = df.drop(['Project Category', 'Client'], axis=1)

# Remove duplicate rows
df = df.drop_duplicates()

# Treat a missing estimate as 0. Only the estimate columns are zero-filled:
# putting 0 into a categorical column would create a bogus category.
categorical_columns = ['tier', 'Parent Product']
estimate_columns = [column for column in df.columns if column not in categorical_columns]
df[estimate_columns] = df[estimate_columns].fillna(0)

# Give a missing product its own explicit label so that every one-hot column
# created below has a string name.
df['Parent Product'] = df['Parent Product'].fillna('Unknown')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert the tier labels to integers. LabelEncoder numbers the distinct
# labels in sorted order, so 'tier 1' < 'tier 2' < 'tier 3' keep their rank.
df['tier'] = label_encoder.fit_transform(df['tier'])

# One-hot encode the product; drop_first=True omits the first dummy column,
# which is redundant given the others.
one_hot_encoded = pd.get_dummies(df['Parent Product'], drop_first=True)

# Concatenate the one-hot encoded columns with the original dataframe
df = pd.concat([df, one_hot_encoded], axis=1)

# Drop the original categorical column
df = df.drop('Parent Product', axis=1)

# Show up to 20 random rows of the prepared frame. sample() raises when asked
# for more rows than exist, so cap the request at the frame length.
df.sample(min(20, len(df)))

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.linear_model import LassoLarsIC, OrthogonalMatchingPursuitCV, LinearRegression, Ridge, BayesianRidge, HuberRegressor
from sklearn.metrics import mean_squared_error

# ---------------------------------------------------------------------------
# Model comparison
#
# Fits several linear-family regressors on the prepared `df` (built by the
# data-preparation cell), predicts 'Total Estimate' on a held-out 25% split
# and reports MSE, RMSE and R^2 for each model.
#
# NOTE(review): the features include the per-role estimate columns. If
# 'Total Estimate' is simply their sum, the target leaks into the features
# and every score below is trivially near-perfect -- confirm against the data.
# ---------------------------------------------------------------------------

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Total Estimate', axis=1), df['Total Estimate'], test_size=0.25, random_state=42)

# One estimator per candidate model. Each keeps its own variable so the
# fitted models remain reachable by name after this cell has run.
lassolarsic = LassoLarsIC()
omp = OrthogonalMatchingPursuitCV()
linearregression = LinearRegression()
ridge = Ridge()
bayesianridge = BayesianRidge()
huberregressor = HuberRegressor()
# Seeded like the train/test split so that repeated runs give the same row.
linearsvr = LinearSVR(random_state=42)

models = {
    'LassoLarsIC': lassolarsic,
    'OrthogonalMatchingPursuitCV': omp,
    'LinearRegression': linearregression,
    'Ridge': ridge,
    'BayesianRidge': bayesianridge,
    'HuberRegressor': huberregressor,
    'LinearSVR': linearsvr,
}

# Fit, predict and evaluate every model the same way
metric_rows = []
for model_name, model in models.items():
    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Evaluate the model. score() on a regressor is the coefficient of
    # determination (R^2), not a classification accuracy: 1.0 is a perfect
    # fit and the value can be negative for a model worse than the mean.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = model.score(X_test, y_test)

    # Print the results
    print('{} MSE:'.format(model_name), mse)
    print('{} RMSE:'.format(model_name), rmse)
    print('{} R^2: {:.4f}'.format(model_name, r2))
    print('-----------------------------------------------')

    metric_rows.append((model_name, mse, rmse, r2))

# Create a DataFrame with the results
results = pd.DataFrame(metric_rows, columns=['Model', 'MSE', 'RMSE', 'R2'])

# Per-model scalars under their original names, kept so that any later cell
# that still refers to them continues to work.
mse_lasso, mse_omp, mse_lr, mse_ridge, mse_br, mse_hr, mse_lsvr = results['MSE']
rmse_lasso, rmse_omp, rmse_lr, rmse_ridge, rmse_br, rmse_hr, rmse_lsvr = results['RMSE']
accuracy_lasso, accuracy_omp, accuracy_lr, accuracy_ridge, accuracy_br, accuracy_hr, accuracy_lsvr = results['R2']

# One panel per metric: MSE, RMSE and R^2 live on very different scales, so
# on a single shared axis the R^2 bars would be too small to read.
results.plot(x='Model', y=['MSE', 'RMSE', 'R2'], kind='bar', subplots=True)

# Display the table with the lowest value per metric in yellow and the highest in red
results.style.highlight_min(subset=['MSE', 'RMSE', 'R2'], color='yellow').highlight_max(subset=['MSE', 'RMSE', 'R2'], color='red')
