# Import librairies

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn import metrics

# PCA
from sklearn.decomposition import PCA

# GridSearch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Load data

In [2]:
train=pd.read_csv("../data/escp_training_set.csv")
test=pd.read_csv("../data/escp_test_set.csv")

In [3]:
combined_data = pd.concat([train, test])

In [4]:
combined_data.head()

Unnamed: 0,company_id,company_name,industry,sector,website,year_founded,employees,city,country_name,revenues,title,description,insta,twitter,fb,linkedin,description_twitter,audience,market_cap
0,28,01 Communique Laboratory,Computer Software,Technology,https://www.01com.com,1992.0,125,Toronto,Canada,709189.0,Post-Quantum Cybersecurity : 01com.com,,,,,linkedin.com/company/01-communique,,,17976400.0
1,190,1-800-Flowers.Com,Retail,Consumer Cyclical,https://www.1800flowers.com,1976.0,4800,Jericho,United States,2147850000.0,Flowers | Flower Delivery | Fresh Flowers Onli...,Send flowers and send a smile! Discover fresh ...,https://www.instagram.com/1800flowers,https://twitter.com/1800flowers,https://www.facebook.com/1800flowers,linkedin.com/company/1800flowers-com,,624395.0,1837320000.0
2,522,1000Mercis,Marketing And Advertising,Communication Services,https://numberly.com,2000.0,386,Paris,France,71950000.0,Accueil | Numberly,,,https://twitter.com/Numberly,https://www.facebook.com/Numberly1000mercis/,linkedin.com/company/1000mercis,Marketing Technologist — Omnichannel. Programm...,22773.1,52835500.0
3,639,104,Staffing And Recruiting,Industrials,https://corp.104.com.tw,,14,New Taipei City,Taiwan,60338200.0,corp.104.com.tw,,,,,,,,217495000.0
4,763,10X Genomics,Biotechnology,Healthcare,https://www.10xgenomics.com,2016.0,1148,Pleasanton,United States,459178000.0,Home Page - 10x Genomics,Resolving Biology to Advance Human Health,,https://twitter.com/10xGenomics,https://www.facebook.com/10xGenomics/,linkedin.com/company/readcoor,Powering researchers’ insights with innovative...,1583000.0,7873110000.0


In [5]:
from math import log

In [6]:
# Keep only relevant features
combined_data_3=combined_data[["industry","sector","year_founded","country_name","revenues","employees","market_cap","audience"]]
combined_data_3["age"]=combined_data_3["year_founded"].apply(lambda x: 2022-x if x else 5)
norm = {}
for col in ["revenues","employees","audience","age","market_cap"]:
    m = combined_data_3[col].mean()
    s = combined_data_3[col].std()
    norm[col] = {"mean":m,"std":s}
    combined_data_3[col] = (combined_data_3[col]-m)/s
#combined_data_3["market_cap"] = combined_data_3["market_cap"].apply(log)
combined_data_3=combined_data_3.drop(["year_founded"],axis=1)

# Fillna
fill_mode = lambda col: col.fillna(col.mode())
combined_data_3[["industry","sector","country_name"]]=combined_data_3[["industry","sector","country_name"]].apply(fill_mode, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data_3["age"]=combined_data_3["year_founded"].apply(lambda x: 2022-x if x else 5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data_3[col] = (combined_data_3[col]-m)/s


In [7]:
#combined_data_3["age"] = combined_data_3["age"].fillna(5)

In [8]:
combined_data_3=pd.get_dummies(combined_data_3, columns=["industry","sector","country_name"])

In [9]:
train_data=combined_data_3.head(25221)
test_data=combined_data_3.tail(1494192)

In [10]:
train_data.head()

Unnamed: 0,revenues,employees,market_cap,audience,age,industry_Accounting,industry_Airlines/Aviation,industry_Alternative Dispute Resolution,industry_Alternative Medicine,industry_Animation,...,country_name_Uruguay,country_name_Uzbekistan,country_name_Vanuatu,country_name_Venezuela,country_name_Vietnam,country_name_West Bank and Gaza,country_name_Yemen,country_name_Zambia,country_name_Zimbabwe,country_name_Åland Islands
0,-0.043449,-0.021631,-0.082989,,0.112226,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.201693,1.0269,-0.049861,0.087475,0.731108,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.002136,0.036908,-0.082354,-0.036471,-0.197214,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.00887,-0.046526,-0.079356,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.22242,0.207813,0.060042,0.284967,-0.816095,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_data=train_data.dropna()

In [12]:
train_data.corr()

Unnamed: 0,revenues,employees,market_cap,audience,age,industry_Accounting,industry_Airlines/Aviation,industry_Alternative Dispute Resolution,industry_Alternative Medicine,industry_Animation,...,country_name_Uruguay,country_name_Uzbekistan,country_name_Vanuatu,country_name_Venezuela,country_name_Vietnam,country_name_West Bank and Gaza,country_name_Yemen,country_name_Zambia,country_name_Zimbabwe,country_name_Åland Islands
revenues,1.000000,0.713352,0.367401,0.444568,0.140391,-0.006600,-0.003937,,-0.005696,-0.003937,...,0.001697,,,,,,,,,
employees,0.713352,1.000000,0.223387,0.447550,0.134348,-0.006514,0.013621,,-0.005763,-0.003529,...,0.004523,,,,,,,,,
market_cap,0.367401,0.223387,1.000000,0.346324,0.037840,-0.002699,-0.005680,,-0.002424,-0.001790,...,0.004465,,,,,,,,,
audience,0.444568,0.447550,0.346324,1.000000,0.102565,-0.004983,0.015021,,-0.004012,-0.002549,...,0.023410,,,,,,,,,
age,0.140391,0.134348,0.037840,0.102565,1.000000,-0.002073,-0.013012,,-0.025989,-0.009327,...,-0.007040,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
country_name_West Bank and Gaza,,,,,,,,,,,...,,,,,,,,,,
country_name_Yemen,,,,,,,,,,,...,,,,,,,,,,
country_name_Zambia,,,,,,,,,,,...,,,,,,,,,,
country_name_Zimbabwe,,,,,,,,,,,...,,,,,,,,,,


## Multiple linear regression 

In [13]:
import statsmodels.api as sm # import statsmodels 
#,"employees"
#X = train_data[["revenues","audience","employees"]]
X = train_data.drop("market_cap", 1)## X usually means our input variables (or independent variables)
X = X[["revenues","age","audience","employees"]]
y = train_data["market_cap"] ## Y usually means our output/dependent variable


#X=df.drop(columns=["Sales","time_diff"])
#y=df["Sales"]




  X = train_data.drop("market_cap", 1)## X usually means our input variables (or independent variables)


In [14]:
#exog = sm.add_constant(X)
#model = sm.OLS(y,exog).fit()
# Note the difference in argument order
model = sm.OLS(y, X).fit() ## sm.OLS(output, input)
predictions = model.predict(X)

# Print out the statistics
model.summary()




0,1,2,3
Dep. Variable:,market_cap,R-squared (uncentered):,0.19
Model:,OLS,Adj. R-squared (uncentered):,0.189
Method:,Least Squares,F-statistic:,399.5
Date:,"Fri, 20 May 2022",Prob (F-statistic):,1.07e-309
Time:,10:12:36,Log-Likelihood:,-13176.0
No. Observations:,6830,AIC:,26360.0
Df Residuals:,6826,BIC:,26390.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
revenues,0.0601,0.003,22.616,0.000,0.055,0.065
age,-0.0310,0.012,-2.610,0.009,-0.054,-0.008
audience,0.1783,0.009,20.222,0.000,0.161,0.196
employees,-0.0241,0.003,-9.161,0.000,-0.029,-0.019

0,1,2,3
Omnibus:,20149.33,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1217570616.275
Skew:,41.603,Prob(JB):,0.0
Kurtosis:,2069.763,Cond. No.,8.88


In [15]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(train_data["market_cap"],predictions, squared=False))

1.6657299767351708


In [16]:
train_data["market_cap"].mean()

0.1115383342066374

## Random forest 

In [None]:
# Labels are the values we want to predict
labels = y
# Remove the labels from the features
# axis 1 refers to the columns
features= X
# Saving feature names for later use
feature_list = list(X.columns)
# Convert to numpy array
features = np.array(features)


# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', mean_squared_error(test_labels,predictions, squared=False)/10e8 )





## Scoring the model over the test set

In [17]:
columns = ["revenues","audience","age","employees"]#
test_data= test_data[columns]


In [18]:
test_data.head()

Unnamed: 0,revenues,audience,age,employees
0,-0.038084,,,-0.038452
1,-0.006395,-0.040111,,-0.007725
2,-0.042343,,-0.119854,-0.047199
3,-0.043398,-0.039674,-0.777415,-0.048769
4,-0.041843,-0.03794,0.034866,-0.046751


In [19]:
for col in test_data.columns:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [20]:
#for col in test_data.columns:
 #   test_data[col] = (test_data[col]-norm[col]["mean"])/norm[col]["std"]


In [21]:
test_data["valuation"]=model.predict(test_data)*norm["market_cap"]["std"]+norm["market_cap"]["mean"]

In [27]:
test_data["valuation"]

0          4.456479e+09
1          4.578551e+09
2          3.334800e+09
3         -3.048555e+09
4          4.905383e+09
               ...     
1494187   -2.673641e+09
1494188    1.886672e+10
1494189    1.056782e+10
1494190    7.711940e+08
1494191    4.467507e+09
Name: valuation, Length: 1494192, dtype: float64

In [28]:
a = test_data["valuation"]
test=pd.read_csv("../data/escp_test_set.csv")
test["valuation"] = a

In [29]:
test

Unnamed: 0,company_id,company_name,industry,sector,website,year_founded,employees,city,country_name,revenues,title,description,insta,twitter,fb,linkedin,description_twitter,audience,valuation
0,0,&,Marketing And Advertising,Communication Services,https://andco.dk,,50,Copenhagen,Denmark,9961710.0,&Co. Agency of the year 2017-21 - Home,,,,https://www.facebook.com/andcodk,linkedin.com/company/&-co.,,,4.456479e+09
1,1,0,Automotive,Consumer Cyclical,https://www.wyomingvalleymotors.com,,187,,,64606800.0,,,,,,linkedin.com/company/wyoming-valley-motors,,5102.66,4.578551e+09
2,2,"0,5 Ponto Pesquisa De Mercado",Market Research,Technology,http://meioponto.com.br,1998.0,11,Sao Paulo,Brazil,2617620.0,Meio Ponto 10 ANOS,,,,,linkedin.com/company/0-5-ponto-pesquisa-de-mer...,,,3.334800e+09
3,3,0.8L,Marketing And Advertising,Communication Services,https://08liter.com,2015.0,4,,,796936.0,"ie½i ee e³³, e³uie¬i","ie ei Ni¡i ii 'ie½'i ei ii¬e³¼ iiu, i e¬ei 1:1...",https://www.instagram.com/0.8l_korea,,https://www.facebook.com/08liter,linkedin.com/company/08liter.com,,7226.85,-3.048555e+09
4,4,0-Co2 | Architettura Sostenibile,Architecture & Planning,Industrials,https://0-co2.it,1994.0,13,,,3479710.0,0-co2 | architettura sostenibile-Bart Conterio...,,,https://twitter.com/BartConterio,https://www.facebook.com/pages/0-co2-architett...,linkedin.com/company/0-co2-architettura-sosten...,"#architect, specialized in #sustainable-design...",15644.40,4.905383e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494187,1519408,_Socialstarters,Professional Training & Coaching,Industrials,https://www.socialstarters.org,2014.0,35,London,United Kingdom,6266440.0,Social Starters | Social Enterprise Volunteeri...,We offer short term social enterprise voluntee...,,https://twitter.com/_socialstarters,https://www.facebook.com/wearesocialstarters,linkedin.com/company/_socialstarters,Business & leadership mentoring to support the...,21885.10,-2.673641e+09
1494188,1519409,_Space Architecture & Management,Architecture & Planning,Industrials,http://creating-space.co.uk,1957.0,78,Tyne And Wear,United Kingdom,13965200.0,,,,,,linkedin.com/company/_space-architecture-&-man...,,,1.886672e+10
1494189,1519410,_Wige Media,Media Production,Communication Services,https://wige-solutions.de,1979.0,48,Cologne,Germany,27238100.0,wige SOLUTIONS,Die wige SOLUTIONS GmbH & Co. KG ist der Full-...,https://www.instagram.com/wigesolutions,,https://www.facebook.com/wigeSOLUTIONS,linkedin.com/company/wige-media-ag,,,1.056782e+10
1494190,1519411,____,Fine Art,Consumer Cyclical,https://espaciocontinuo.com,2005.0,165,,,41529000.0,,,https://www.instagram.com/andayah,,,linkedin.com/company/____,,2355.08,7.711940e+08


In [30]:
test.to_csv("test_prediction.csv")