## Getting Started

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv('Fortune_1000.csv')

In [3]:
data

Unnamed: 0,company,rank,rank_change,revenue,profit,num. of employees,sector,city,state,newcomer,ceo_founder,ceo_woman,profitable,prev_rank,CEO,Website,Ticker,Market Cap
0,Walmart,1,0.0,523964.0,14881.0,2200000,Retailing,Bentonville,AR,no,no,no,yes,1.0,C. Douglas McMillon,https://www.stock.walmart.com,WMT,411690
1,Amazon,2,3.0,280522.0,11588.0,798000,Retailing,Seattle,WA,no,yes,no,yes,5.0,Jeffrey P. Bezos,https://www.amazon.com,AMZN,1637405
2,Exxon Mobil,3,-1.0,264938.0,14340.0,74900,Energy,Irving,TX,no,no,no,yes,2.0,Darren W. Woods,https://www.exxonmobil.com,XOM,177923
3,Apple,4,-1.0,260174.0,55256.0,137000,Technology,Cupertino,CA,no,no,no,yes,3.0,Timothy D. Cook,https://www.apple.com,AAPL,2221176
4,CVS Health,5,3.0,256776.0,6634.0,290000,Health Care,Woonsocket,RI,no,no,yes,yes,8.0,Karen S. Lynch,https://www.cvshealth.com,CVS,98496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Mr. Cooper Group,996,0.0,2007.0,274.0,9100,Financials,Coppell,TX,,no,no,yes,,Jay Bray,https://mrcoopergroup.com,COOP,674.1
996,Herc Holdings,997,0.0,1999.0,47.5,5100,Business Services,Bonita Springs,FL,,no,no,yes,,Lawrence H. Silber,https://www.hercrentals.com,HRI,590.5
997,Healthpeak Properties,998,0.0,1997.4,45.5,204,Financials,Irvine,CA,,no,no,yes,,Thomas M. Herzog,https://www.hcpi.com,PEAK,12059.3
998,SPX FLOW,999,0.0,1996.3,-95.1,5000,Industrials,Charlotte,NC,,no,no,no,,Marcus G. Michael,https://www.spxflow.com,FLOW,1211.8


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company            1000 non-null   object 
 1   rank               1000 non-null   int64  
 2   rank_change        1000 non-null   float64
 3   revenue            1000 non-null   float64
 4   profit             998 non-null    float64
 5   num. of employees  1000 non-null   int64  
 6   sector             1000 non-null   object 
 7   city               1000 non-null   object 
 8   state              1000 non-null   object 
 9   newcomer           500 non-null    object 
 10  ceo_founder        1000 non-null   object 
 11  ceo_woman          1000 non-null   object 
 12  profitable         1000 non-null   object 
 13  prev_rank          1000 non-null   object 
 14  CEO                992 non-null    object 
 15  Website            1000 non-null   object 
 16  Ticker             938 no

## Preprocessing

In [51]:
def preprocess_inputs(df):
    df = df.copy()

    # drop unused columns
    df = df.drop(['company', 'rank', 'rank_change', 'newcomer', 'prev_rank', 'CEO', 'Website', 'Ticker'], axis=1)

    # encode missing values
    df['Market Cap'] = df['Market Cap'].replace('-', np.NaN).astype(np.float)

    # drop missing target rows
    missing_target_rows = df[df['Market Cap'].isna()].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)

    # fill remaining missing values
    df['profit'] = df['profit'].fillna(df['profit'].mean())

    # binary encoding
    for column in ['ceo_founder','ceo_woman','profitable']:
        df[column] = df[column].replace({'no':0, 'yes':1})

    # onehot encoding
    for column in ['sector', 'city', 'state']:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)

    # split into 'X' & 'y'
    X = df.drop('Market Cap', axis=1)
    y = df['Market Cap']

    # train-test-split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # scale X
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

#### Column removal observation;
- Removing __'company'__ column as it just includes names of the company and has unique values which contribute none to the prediction model
- 'rank' column correlates to 'revenue' and not to 'Market Cap' which is our target variable so 'rank' does not interfere in our model (columns correlating to target variable must be removed as it could leak data to the prediction model and make the model redundant), however 'revenue' is already representing the distance between companies in revenue, so it doesn't make sense to have 'rank' column and 'rank_change' column. So we are better removing __'rank'__ and __'rank_change'__ column 
- Removing __'newcomer'__ column as it has only 500 records of the dataset which is 50%. Imputing it with mode will create confusion in the prediction model as it contains most of the fabricated data
- 'prev_rank' column contains most of the data as blank space and if we replace those columns with NaN values, we come to know that there are about 523 NaN values which is a lot of records gone missing, so we might just remove __'prev_rank'__ column
- Removing __'CEO'__ column as it just includes names of CEO of the company and has unique values which contribute none to the prediction model
- Removing __'Website'__ column as it just includes website of the company and has unique values which contribute none to the prediction model
- Removing __'Ticker'__ column as it just includes ticker of the company and has unique values which contribute none to the prediction model

#### Encoding observation
- Binary encoding __'ceo_founder','ceo_woman','profitable'__ columns which consists of only two classes
- OneHotEncoding __'sector', 'city', 'state'__ columns

#### Missing values observation;
- 'Market Cap' column contains 40 missing values, so we are dropping it
- 'Market Cap' column is an object data type but it actually contains numerical values, so we are going to typecast it. After typecasting, there are '-' values in the data, so we are going to replace those values with NaN values and drop those as well

In [52]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [53]:
X_train.describe()

Unnamed: 0,revenue,profit,num. of employees,ceo_founder,ceo_woman,profitable,sector_Aerospace & Defense,sector_Apparel,sector_Business Services,sector_Chemicals,...,state_PA,state_PR,state_RI,state_SC,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI
count,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,...,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0
mean,-7.479397e-17,2.1369710000000002e-17,-2.6712130000000002e-17,-2.4040920000000002e-17,2.1369710000000002e-17,1.148622e-16,5.342427e-18,-4.00682e-17,5.876669000000001e-17,2.671213e-18,...,6.410912e-17,-2.4040920000000002e-17,-1.068485e-17,-5.342427e-18,-1.3356070000000002e-17,8.815004e-17,0.0,-5.342427e-18,3.205456e-17,1.602728e-17
std,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,...,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753,1.000753
min,-0.3890891,-1.627735,-0.3504698,-0.2285064,-0.2496003,-2.419849,-0.1619709,-0.1296903,-0.2392357,-0.1760902,...,-0.209657,-0.03880753,-0.03880753,-0.03880753,-0.1412043,-0.356537,-0.038808,-0.1976424,-0.1235604,-0.1570137
25%,-0.3560759,-0.2630379,-0.2887698,-0.2285064,-0.2496003,0.4132489,-0.1619709,-0.1296903,-0.2392357,-0.1760902,...,-0.209657,-0.03880753,-0.03880753,-0.03880753,-0.1412043,-0.356537,-0.038808,-0.1976424,-0.1235604,-0.1570137
50%,-0.2830086,-0.2000001,-0.2276602,-0.2285064,-0.2496003,0.4132489,-0.1619709,-0.1296903,-0.2392357,-0.1760902,...,-0.209657,-0.03880753,-0.03880753,-0.03880753,-0.1412043,-0.356537,-0.038808,-0.1976424,-0.1235604,-0.1570137
75%,-0.08492612,-0.05587917,-0.06663828,-0.2285064,-0.2496003,0.4132489,-0.1619709,-0.1296903,-0.2392357,-0.1760902,...,-0.209657,-0.03880753,-0.03880753,-0.03880753,-0.1412043,-0.356537,-0.038808,-0.1976424,-0.1235604,-0.1570137
max,14.28774,17.60714,20.26823,4.376244,4.006405,0.4132489,6.173949,7.710677,4.179979,5.678908,...,4.769696,25.7682,25.7682,25.7682,7.081938,2.804758,25.768197,5.059644,8.093207,6.36887


## Model Training

In [57]:
models = {
    "    Linear Regression": LinearRegression(),
    "Linear Regression(L2)": Ridge(),
    "Linear Regression(L3)": Lasso(),
    "        Decision Tree": DecisionTreeRegressor(),
    "       Neural Network": MLPRegressor(),
    "        Random Forest": RandomForestRegressor(),
    "    Gradient Boosting": GradientBoostingRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained successfully...')

    Linear Regression trained successfully...
Linear Regression(L2) trained successfully...
Linear Regression(L3) trained successfully...
        Decision Tree trained successfully...
       Neural Network trained successfully...
        Random Forest trained successfully...
    Gradient Boosting trained successfully...


## Results

In [58]:
## rmse
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred)**2))
    print(name + " RMSE: {:.2f}".format(rmse))

    Linear Regression RMSE: 21073042181926162432.00
Linear Regression(L2) RMSE: 150287.44
Linear Regression(L3) RMSE: 148364.67
        Decision Tree RMSE: 111966.64
       Neural Network RMSE: 149455.61
        Random Forest RMSE: 91313.45
    Gradient Boosting RMSE: 94796.40


In [59]:
## r2
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(name + " R^2: {:.2f}".format(r2))

    Linear Regression R^2: -21049272627703147880656142336.00
Linear Regression(L2) R^2: -0.07
Linear Regression(L3) R^2: -0.04
        Decision Tree R^2: 0.41
       Neural Network R^2: -0.06
        Random Forest R^2: 0.60
    Gradient Boosting R^2: 0.57
