In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, HistGradientBoostingClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score, f1_score, accuracy_score, make_scorer, roc_auc_score

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
RANDOM_STATE = 42

In [2]:
import sklearn
print(sklearn.__version__)

1.5.1


In [3]:
import imblearn
print(imblearn.__version__)

0.12.3


In [4]:
from imblearn.over_sampling import RandomOverSampler

In [5]:
#!pip install lightgbm
import lightgbm as lgb 
from lightgbm import LGBMClassifier

In [6]:
train_data = pd.read_csv('datasets\kaggle_startups_train_28062024.csv')

display(train_data.head(5))
train_data.info()

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,closed_at
0,Lunchgate,Online Reservations|Restaurants,828626.0,operating,CHE,25,Zurich,Zürich,2,2009-10-17,2011-05-01,2014-12-01,
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,operating,USA,CA,SF Bay Area,Redwood City,4,2005-01-01,2010-05-04,2014-02-25,
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,operating,USA,CA,SF Bay Area,San Jose,3,2006-01-01,2012-08-20,2014-07-02,
3,Sancilio and Company,Health Care,22250000.0,operating,,,,,3,2004-01-01,2011-09-01,2014-07-18,
4,WireTough Cylinders,Manufacturing,,operating,USA,VA,VA - Other,Bristol,1,2010-05-12,2012-02-01,2012-02-01,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52516 entries, 0 to 52515
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               52515 non-null  object 
 1   category_list      50051 non-null  object 
 2   funding_total_usd  42447 non-null  float64
 3   status             52516 non-null  object 
 4   country_code       47014 non-null  object 
 5   state_code         45753 non-null  object 
 6   region             46157 non-null  object 
 7   city               46157 non-null  object 
 8   funding_rounds     52516 non-null  int64  
 9   founded_at         52516 non-null  object 
 10  first_funding_at   52516 non-null  object 
 11  last_funding_at    52516 non-null  object 
 12  closed_at          4917 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 5.2+ MB


In [7]:
def split_column(df, column, new_columns, delimiter='|'):
    # Разделяем строку на отдельные элементы
    split_data = df[column].str.split(delimiter, expand=True)
    
    # Присваиваем новые столбцы
    for i, new_column in enumerate(new_columns):
        df[new_column] = split_data[i]
    
    return df

# Задаем новые столбцы
new_columns = ['first', 'second', 'third']

# Применяем функцию к DataFrame
df = split_column(train_data, 'category_list', new_columns)

df

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,closed_at,first,second,third
0,Lunchgate,Online Reservations|Restaurants,828626.0,operating,CHE,25,Zurich,Zürich,2,2009-10-17,2011-05-01,2014-12-01,,Online Reservations,Restaurants,
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,operating,USA,CA,SF Bay Area,Redwood City,4,2005-01-01,2010-05-04,2014-02-25,,Manufacturing,Medical,Medical Devices
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,operating,USA,CA,SF Bay Area,San Jose,3,2006-01-01,2012-08-20,2014-07-02,,Biotechnology,,
3,Sancilio and Company,Health Care,22250000.0,operating,,,,,3,2004-01-01,2011-09-01,2014-07-18,,Health Care,,
4,WireTough Cylinders,Manufacturing,,operating,USA,VA,VA - Other,Bristol,1,2010-05-12,2012-02-01,2012-02-01,,Manufacturing,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52511,Videostream,Entertainment,,operating,CAN,ON,Toronto,Kitchener,1,2012-01-01,2014-03-01,2014-03-01,,Entertainment,,
52512,Hello Curry,Hospitality,500000.0,operating,IND,2,Hyderabad,Hyderabad,1,2013-08-25,2014-03-07,2014-03-07,,Hospitality,,
52513,Taskforce,Email|Messaging|Productivity Software,50000.0,operating,USA,CA,SF Bay Area,San Francisco,3,2010-07-01,2009-06-14,2011-01-01,,Email,Messaging,Productivity Software
52514,NetScaler,Security,13000000.0,operating,USA,CA,SF Bay Area,San Jose,6,1997-12-01,1998-11-30,2004-03-01,,Security,,


In [8]:
!pip install phik -q 
import phik 

In [9]:
df['third'].value_counts()

third
Software              1016
Mobile                 869
Social Media           503
Services               480
Technology             476
                      ... 
Made in Italy            1
Coffee                   1
Prediction Markets       1
Charity                  1
Distributors             1
Name: count, Length: 735, dtype: int64

In [11]:
test_data = pd.read_csv('datasets\kaggle_startups_test_28062024.csv')

display(test_data.head(5))
test_data.info()

Unnamed: 0,name,category_list,funding_total_usd,country_code,state_code,region,city,funding_rounds,first_funding_at,last_funding_at,lifetime
0,Crystalsol,Clean Technology,2819200.0,NIC,17,,,1,2009-07-01,2009-07-01,3501
1,JBI Fish & Wings,Hospitality,,USA,TN,TN - Other,Humboldt,1,2010-07-28,2010-07-28,2717
2,COINPLUS,Finance,428257.0,LUX,3,Esch-sur-alzette,Esch-sur-alzette,2,2014-05-15,2014-09-18,1295
3,Imagine Communications,Software|Video|Video Streaming,34700000.0,USA,CA,San Diego,San Diego,4,2005-01-01,2010-04-20,4748
4,DNA13,Software,4530000.0,CAN,ON,Ottawa,Ottawa,1,2007-05-08,2007-05-08,6209


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13125 entries, 0 to 13124
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               13125 non-null  object 
 1   category_list      12534 non-null  object 
 2   funding_total_usd  10547 non-null  float64
 3   country_code       11743 non-null  object 
 4   state_code         11430 non-null  object 
 5   region             11536 non-null  object 
 6   city               11538 non-null  object 
 7   funding_rounds     13125 non-null  int64  
 8   first_funding_at   13125 non-null  object 
 9   last_funding_at    13125 non-null  object 
 10  lifetime           13125 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 1.1+ MB


In [12]:
train_data['closed_at'] = train_data['closed_at'].fillna('2017-12-31')

train_data['first_funding_at'] = pd.to_datetime(train_data['first_funding_at'], format='%Y-%m-%d')
train_data['last_funding_at'] = pd.to_datetime(train_data['last_funding_at'], format='%Y-%m-%d')
train_data['time_funding'] = train_data['last_funding_at'] - train_data['first_funding_at']

train_data['time_funding'] = train_data['time_funding'].astype('int64') // 10**9 // 86400

In [13]:
train_data['founded_at'] = pd.to_datetime(train_data['founded_at'], format='%Y-%m-%d')
train_data['closed_at'] = pd.to_datetime(train_data['closed_at'], format='%Y-%m-%d')
train_data['lifetime'] = train_data['closed_at'] - train_data['founded_at']

train_data['lifetime'] = train_data['lifetime'].astype('int64') // 10**9 // 86400

In [14]:
display(train_data.head(5))
train_data.info()

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,closed_at,first,second,third,time_funding,lifetime
0,Lunchgate,Online Reservations|Restaurants,828626.0,operating,CHE,25,Zurich,Zürich,2,2009-10-17,2011-05-01,2014-12-01,2017-12-31,Online Reservations,Restaurants,,1310,2997
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,operating,USA,CA,SF Bay Area,Redwood City,4,2005-01-01,2010-05-04,2014-02-25,2017-12-31,Manufacturing,Medical,Medical Devices,1393,4747
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,operating,USA,CA,SF Bay Area,San Jose,3,2006-01-01,2012-08-20,2014-07-02,2017-12-31,Biotechnology,,,681,4382
3,Sancilio and Company,Health Care,22250000.0,operating,,,,,3,2004-01-01,2011-09-01,2014-07-18,2017-12-31,Health Care,,,1051,5113
4,WireTough Cylinders,Manufacturing,,operating,USA,VA,VA - Other,Bristol,1,2010-05-12,2012-02-01,2012-02-01,2017-12-31,Manufacturing,,,0,2790


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52516 entries, 0 to 52515
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   name               52515 non-null  object        
 1   category_list      50051 non-null  object        
 2   funding_total_usd  42447 non-null  float64       
 3   status             52516 non-null  object        
 4   country_code       47014 non-null  object        
 5   state_code         45753 non-null  object        
 6   region             46157 non-null  object        
 7   city               46157 non-null  object        
 8   funding_rounds     52516 non-null  int64         
 9   founded_at         52516 non-null  datetime64[ns]
 10  first_funding_at   52516 non-null  datetime64[ns]
 11  last_funding_at    52516 non-null  datetime64[ns]
 12  closed_at          52516 non-null  datetime64[ns]
 13  first              50051 non-null  object        
 14  second

In [15]:
train_data.columns

Index(['name', 'category_list', 'funding_total_usd', 'status', 'country_code',
       'state_code', 'region', 'city', 'funding_rounds', 'founded_at',
       'first_funding_at', 'last_funding_at', 'closed_at', 'first', 'second',
       'third', 'time_funding', 'lifetime'],
      dtype='object')

In [16]:
test_data['first_funding_at'] = pd.to_datetime(test_data['first_funding_at'], format='%Y-%m-%d')
test_data['last_funding_at'] = pd.to_datetime(test_data['last_funding_at'], format='%Y-%m-%d')
test_data['time_funding'] = test_data['last_funding_at'] - test_data['first_funding_at']

test_data['time_funding'] = test_data['time_funding'].astype('int64') // 10**9 // 86400

In [17]:
display(test_data.head(5))
test_data.info()

Unnamed: 0,name,category_list,funding_total_usd,country_code,state_code,region,city,funding_rounds,first_funding_at,last_funding_at,lifetime,time_funding
0,Crystalsol,Clean Technology,2819200.0,NIC,17,,,1,2009-07-01,2009-07-01,3501,0
1,JBI Fish & Wings,Hospitality,,USA,TN,TN - Other,Humboldt,1,2010-07-28,2010-07-28,2717,0
2,COINPLUS,Finance,428257.0,LUX,3,Esch-sur-alzette,Esch-sur-alzette,2,2014-05-15,2014-09-18,1295,126
3,Imagine Communications,Software|Video|Video Streaming,34700000.0,USA,CA,San Diego,San Diego,4,2005-01-01,2010-04-20,4748,1935
4,DNA13,Software,4530000.0,CAN,ON,Ottawa,Ottawa,1,2007-05-08,2007-05-08,6209,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13125 entries, 0 to 13124
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   name               13125 non-null  object        
 1   category_list      12534 non-null  object        
 2   funding_total_usd  10547 non-null  float64       
 3   country_code       11743 non-null  object        
 4   state_code         11430 non-null  object        
 5   region             11536 non-null  object        
 6   city               11538 non-null  object        
 7   funding_rounds     13125 non-null  int64         
 8   first_funding_at   13125 non-null  datetime64[ns]
 9   last_funding_at    13125 non-null  datetime64[ns]
 10  lifetime           13125 non-null  int64         
 11  time_funding       13125 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(3), object(6)
memory usage: 1.2+ MB


In [18]:
train_data = train_data[['name', 
                         'category_list', 
                         'funding_total_usd', 
                         'country_code',
                         'funding_rounds', 
                         'time_funding',
                         'lifetime',
                         'status']]

In [19]:
train_data

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,status
0,Lunchgate,Online Reservations|Restaurants,828626.0,CHE,2,1310,2997,operating
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,USA,4,1393,4747,operating
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,USA,3,681,4382,operating
3,Sancilio and Company,Health Care,22250000.0,,3,1051,5113,operating
4,WireTough Cylinders,Manufacturing,,USA,1,0,2790,operating
...,...,...,...,...,...,...,...,...
52511,Videostream,Entertainment,,CAN,1,0,2191,operating
52512,Hello Curry,Hospitality,500000.0,IND,1,0,1589,operating
52513,Taskforce,Email|Messaging|Productivity Software,50000.0,USA,3,566,2740,operating
52514,NetScaler,Security,13000000.0,USA,6,1918,7335,operating


In [20]:
test_data = test_data[['name', 
                         'category_list', 
                         'funding_total_usd', 
                         'country_code',
                         'funding_rounds', 
                         'time_funding',
                         'lifetime']]

In [21]:
train_data ['feat'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data ['feat'] = 0


In [22]:
test_data ['feat'] = 1

In [23]:
test_data.shape

(13125, 8)

In [24]:
train_data.shape

(52516, 9)

In [25]:
integ = pd.concat([train_data, test_data], ignore_index=True)

In [26]:
integ

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,status,feat
0,Lunchgate,Online Reservations|Restaurants,828626.0,CHE,2,1310,2997,operating,0
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,USA,4,1393,4747,operating,0
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,USA,3,681,4382,operating,0
3,Sancilio and Company,Health Care,22250000.0,,3,1051,5113,operating,0
4,WireTough Cylinders,Manufacturing,,USA,1,0,2790,operating,0
...,...,...,...,...,...,...,...,...,...
65636,Jymob,Recruiting,28000.0,USA,1,0,2267,,1
65637,GlobaTrek,Software,,USA,1,0,2192,,1
65638,Inkd.com,Design|E-Commerce|Marketplaces|Printing,5600000.0,USA,3,1030,2922,,1
65639,AdWill,Advertising Exchanges|Mobile|Mobile Advertising,150000.0,USA,2,38,1386,,1


In [27]:
integ.shape

(65641, 9)

In [28]:
integ_shuffled = integ.sample(frac=1).reset_index(drop=True)
integ

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,status,feat
0,Lunchgate,Online Reservations|Restaurants,828626.0,CHE,2,1310,2997,operating,0
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,USA,4,1393,4747,operating,0
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,USA,3,681,4382,operating,0
3,Sancilio and Company,Health Care,22250000.0,,3,1051,5113,operating,0
4,WireTough Cylinders,Manufacturing,,USA,1,0,2790,operating,0
...,...,...,...,...,...,...,...,...,...
65636,Jymob,Recruiting,28000.0,USA,1,0,2267,,1
65637,GlobaTrek,Software,,USA,1,0,2192,,1
65638,Inkd.com,Design|E-Commerce|Marketplaces|Printing,5600000.0,USA,3,1030,2922,,1
65639,AdWill,Advertising Exchanges|Mobile|Mobile Advertising,150000.0,USA,2,38,1386,,1


In [29]:
integ_shuffled

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,status,feat
0,Bill Me Later,Credit|E-Commerce|Payments,100000000.0,USA,3,640,6574,operating,0
1,NY Slice,Delivery|Franchises|Restaurants,470248.0,,1,0,1326,,1
2,lifeIMAGE,Health Care,62669579.0,USA,10,2303,3652,operating,0
3,Balaya,Public Relations,155000.0,,1,0,1772,,1
4,Minneapolis Biomass Exchange,Curated Web,65000.0,USA,1,0,3105,operating,0
...,...,...,...,...,...,...,...,...,...
65636,Qustodio,SaaS|Security|Software,2500000.0,ESP,2,477,1933,operating,0
65637,Coleccionarte,Art|Artists Globally|Communities,50000.0,,1,0,2921,operating,0
65638,NephroGenex,Biotechnology|Health Care,21800000.0,USA,4,3052,4261,operating,0
65639,Hiddenbed,Furniture,700000.0,,1,0,5965,operating,0


In [30]:
# создаём списки с названиями входящих признаков для модели
# номинальные категориальные
ord_columns = [

    'country_code'
]

# количественные
num_columns = [

    'funding_rounds', 
    'time_funding',
    'lifetime'
]

all_input_features = ord_columns + num_columns

# разделяем входящие признаки и таргет в тренировочном датасете
X = integ_shuffled[all_input_features]
y = integ_shuffled['feat']

In [31]:
# Предобработка категориальных признаков
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='other')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ('imputer2', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1))
])

# Предобработка числовых признаков
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', 'passthrough')  # Placeholder for scaler
])

# Объединение предобработки
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, ord_columns),
        ('num', numeric_transformer, num_columns),
    ])

# Создание пайплайна с моделью
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('models', LGBMClassifier())
])

In [32]:
integ['feat'].value_counts(normalize=True)

feat
0    0.800049
1    0.199951
Name: proportion, dtype: float64

In [33]:
sampler = RandomOverSampler(random_state=42)
X_resample, y_resample = sampler.fit_resample(X, y)

pd.DataFrame(y_resample).value_counts(normalize=True)

feat
0       0.5
1       0.5
Name: proportion, dtype: float64

In [56]:
X_resample.shape

(3074, 4)

In [34]:
# cловари гиперпараметров для моделей
param_grid = [

      {
        'models': [LGBMClassifier(random_state=42)],
        'models__num_leaves': [30],
        'models__learning_rate': [0.1],
        'models__n_estimators': [100],
        'models__max_depth': [30],
        'models__lambda_l2': [0],
        'models__lambda_l1': [0],
       
        'preprocessor__num': [StandardScaler(), MinMaxScaler(), 'passthrough']  
    }
    
    
]

In [35]:
# поиск лучшей модели и ее гиперпараметров
model = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)

model.fit(X_resample, y_resample)
print(f'Лучшая модель: {model.best_estimator_.named_steps["models"]}')
print(f'Метрика roc_auc лучшей модели по результатам кросс-валидации: {round(model.best_score_, 2)}')

[LightGBM] [Info] Number of positive: 52516, number of negative: 52516
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 643
[LightGBM] [Info] Number of data points in the train set: 105032, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Лучшая модель: LGBMClassifier(lambda_l1=0, lambda_l2=0, max_depth=30, num_leaves=30,
               random_state=42)
Метрика roc_auc лучшей модели по результатам кросс-валидации: 0.84


In [36]:
probabilities = model.predict_proba(integ)



In [37]:
probabilities

array([[0.58301041, 0.41698959],
       [0.92381082, 0.07618918],
       [0.91526603, 0.08473397],
       ...,
       [0.0545855 , 0.9454145 ],
       [0.50186624, 0.49813376],
       [0.04207754, 0.95792246]])

In [38]:
len(probabilities[:,0])

65641

In [39]:
integ['prob'] = probabilities[:,1]

In [40]:
integ

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,status,feat,prob
0,Lunchgate,Online Reservations|Restaurants,828626.0,CHE,2,1310,2997,operating,0,0.416990
1,EarLens,Manufacturing|Medical|Medical Devices,42935019.0,USA,4,1393,4747,operating,0,0.076189
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,USA,3,681,4382,operating,0,0.084734
3,Sancilio and Company,Health Care,22250000.0,,3,1051,5113,operating,0,0.042485
4,WireTough Cylinders,Manufacturing,,USA,1,0,2790,operating,0,0.510859
...,...,...,...,...,...,...,...,...,...,...
65636,Jymob,Recruiting,28000.0,USA,1,0,2267,,1,0.607769
65637,GlobaTrek,Software,,USA,1,0,2192,,1,0.966637
65638,Inkd.com,Design|E-Commerce|Marketplaces|Printing,5600000.0,USA,3,1030,2922,,1,0.945415
65639,AdWill,Advertising Exchanges|Mobile|Mobile Advertising,150000.0,USA,2,38,1386,,1,0.498134


In [41]:
chosen = integ.query('feat == 0').query('prob > 0.55')
chosen

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,status,feat,prob
38,Yarwoods Martial Arts,Fitness|Sports,1200.0,USA,1,0,1555,operating,0,0.552335
72,Oceanlinx,Clean Technology,,AUS,1,0,3858,operating,0,0.585532
89,Syrenaica,Business Services|Defense|Games|Software|Techn...,40000.0,CHL,1,0,2258,operating,0,0.616842
95,iBid2Save,Advertising|Auctions|E-Commerce|Internet Marke...,550000.0,USA,2,230,2554,operating,0,0.576562
142,Culhanimal Productions,,25000.0,,1,0,514,closed,0,0.558222
...,...,...,...,...,...,...,...,...,...,...
52319,Sleeping On Air,,,USA,1,0,1375,operating,0,0.582216
52350,Smallaa,Facebook Applications|Interest Graph|Networkin...,3000000.0,USA,1,0,521,closed,0,0.558222
52386,Danger,Software,118300000.0,USA,5,2251,6605,operating,0,0.611733
52467,Thermal Nomad,,,USA,1,0,1351,operating,0,0.555497


In [42]:
chosen['status'].value_counts(normalize=True)

status
operating    0.80303
closed       0.19697
Name: proportion, dtype: float64

In [43]:
# модель прогнозирования

In [44]:
# создаём списки с названиями входящих признаков для модели
# номинальные категориальные
ord_columns = [

    'country_code'
]

# количественные
num_columns = [

    'funding_rounds', 
    'time_funding',
    'lifetime'
]

all_input_features = ord_columns + num_columns

# разделяем входящие признаки и таргет в тренировочном датасете
X = chosen[all_input_features]
y = chosen['status']

# кодируем таргет
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

In [45]:
# Предобработка категориальных признаков
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='other')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ('imputer2', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1))
])

# Предобработка числовых признаков
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', 'passthrough')  # Placeholder for scaler
])

# Объединение предобработки
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, ord_columns),
        ('num', numeric_transformer, num_columns),
    ])

# Создание пайплайна с моделью
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('models', LGBMClassifier())
])

In [46]:
sampler_2 = RandomOverSampler(random_state=42)
X_resample, y_resample = sampler_2.fit_resample(X, y)

pd.DataFrame(y_resample).value_counts(normalize=True)

0    0.5
1    0.5
Name: proportion, dtype: float64

In [47]:
param_grid = [

      {
        'models': [LGBMClassifier(random_state=42)],
        'models__num_leaves': [10],
        'models__learning_rate': [0.1],
        'models__n_estimators': [100],
        'models__max_depth': [20],
        'models__lambda_l2': [0],
        'models__lambda_l1': [0],
       
        'preprocessor__num': [StandardScaler(), MinMaxScaler(), 'passthrough']  
    }
    
    
]

In [48]:
# поиск лучшей модели и ее гиперпараметров
model2 = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=5,
    scoring='f1',
    n_jobs=-1
)

model2.fit(X_resample, y_resample)
print(f'Лучшая модель: {model2.best_estimator_.named_steps["models"]}')
print(f'Метрика f1 лучшей модели по результатам кросс-валидации: {round(model2.best_score_, 2)}')

[LightGBM] [Info] Number of positive: 1537, number of negative: 1537
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 566
[LightGBM] [Info] Number of data points in the train set: 3074, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Лучшая модель: LGBMClassifier(lambda_l1=0, lambda_l2=0, max_depth=20, num_leaves=10,
               random_state=42)
Метрика f1 лучшей модели по результатам кросс-валидации: 0.91


In [49]:
y_pred = model2.predict(test_data[all_input_features])
y_pred



array([1, 1, 1, ..., 1, 1, 1])

In [50]:
test_data['status'] = pd.DataFrame(y_pred)
test_data

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,feat,status
0,Crystalsol,Clean Technology,2819200.0,NIC,1,0,3501,1,1
1,JBI Fish & Wings,Hospitality,,USA,1,0,2717,1,1
2,COINPLUS,Finance,428257.0,LUX,2,126,1295,1,1
3,Imagine Communications,Software|Video|Video Streaming,34700000.0,USA,4,1935,4748,1,1
4,DNA13,Software,4530000.0,CAN,1,0,6209,1,1
...,...,...,...,...,...,...,...,...,...
13120,Jymob,Recruiting,28000.0,USA,1,0,2267,1,1
13121,GlobaTrek,Software,,USA,1,0,2192,1,1
13122,Inkd.com,Design|E-Commerce|Marketplaces|Printing,5600000.0,USA,3,1030,2922,1,1
13123,AdWill,Advertising Exchanges|Mobile|Mobile Advertising,150000.0,USA,2,38,1386,1,1


In [51]:
test_data ['status'] = test_data['status'].apply(lambda x: 'operating' if x == 1 else 'closed')
test_data

Unnamed: 0,name,category_list,funding_total_usd,country_code,funding_rounds,time_funding,lifetime,feat,status
0,Crystalsol,Clean Technology,2819200.0,NIC,1,0,3501,1,operating
1,JBI Fish & Wings,Hospitality,,USA,1,0,2717,1,operating
2,COINPLUS,Finance,428257.0,LUX,2,126,1295,1,operating
3,Imagine Communications,Software|Video|Video Streaming,34700000.0,USA,4,1935,4748,1,operating
4,DNA13,Software,4530000.0,CAN,1,0,6209,1,operating
...,...,...,...,...,...,...,...,...,...
13120,Jymob,Recruiting,28000.0,USA,1,0,2267,1,operating
13121,GlobaTrek,Software,,USA,1,0,2192,1,operating
13122,Inkd.com,Design|E-Commerce|Marketplaces|Printing,5600000.0,USA,3,1030,2922,1,operating
13123,AdWill,Advertising Exchanges|Mobile|Mobile Advertising,150000.0,USA,2,38,1386,1,operating


In [52]:
test_data['status'].value_counts(normalize=True)

status
operating    0.8448
closed       0.1552
Name: proportion, dtype: float64

In [53]:
feedback = test_data [['name', 'status']]
feedback

Unnamed: 0,name,status
0,Crystalsol,operating
1,JBI Fish & Wings,operating
2,COINPLUS,operating
3,Imagine Communications,operating
4,DNA13,operating
...,...,...
13120,Jymob,operating
13121,GlobaTrek,operating
13122,Inkd.com,operating
13123,AdWill,operating


In [54]:
feedback.to_csv(r'C:\Users\Алекс\Desktop\projects\startups_closing\datasets\feedback.csv', index=False)