In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

import pandas as pd

def encode_text_values(df, feature, top_n=2):
    """Encode a text column as small integer codes based on value frequency.

    The ``top_n`` most frequent values of ``df[feature]`` receive the codes
    ``1..top_n`` (1 = most frequent). Every other value, including missing
    values, is encoded as ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a copy is returned.
    feature : str
        Name of the column to encode.
    top_n : int, default 2
        Number of most frequent values that get their own non-zero code.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``feature`` holds the integer codes.
    """
    # Count how often each distinct value occurs and keep the most frequent ones.
    top_values = df[feature].value_counts().nlargest(top_n).index

    # Value -> code lookup: codes start at 1 so that 0 can mean "anything else".
    encoding_dict = {value: code for code, value in enumerate(top_values, start=1)}

    # Work on a copy so the caller's frame keeps its raw text column; encoding
    # in place would make a second call encode the already-encoded integers.
    encoded = df.copy()

    # Values missing from the lookup (rare categories, NaN) map to NaN -> 0.
    encoded[feature] = df[feature].map(encoding_dict).fillna(0).astype("int64")

    return encoded

class TextEncoder(BaseEstimator, TransformerMixin):
    """Frequency-based integer encoder for a single text column.

    ``fit`` learns which ``top_n`` values of ``feature`` are the most frequent
    and assigns them the codes ``1..top_n`` (1 = most frequent). ``transform``
    applies that learned mapping; every other value, including missing values,
    becomes ``0``. Learning the mapping in ``fit`` guarantees that training and
    test data are encoded with the same codes.

    Parameters
    ----------
    feature : str
        Name of the column to encode.
    top_n : int, default 2
        Number of most frequent values that get their own non-zero code.

    Attributes
    ----------
    encoding_dict_ : dict
        Value -> code mapping learned by ``fit``.
    """

    def __init__(self, feature, top_n=2):
        self.feature = feature
        self.top_n = top_n

    def _learn_mapping(self, X):
        """Return the value -> code mapping for the most frequent values in X."""
        top_values = X[self.feature].value_counts().nlargest(self.top_n).index
        return {value: code for code, value in enumerate(top_values, start=1)}

    def fit(self, X, y=None):
        """Learn the encoding from ``X[self.feature]`` and return ``self``."""
        self.encoding_dict_ = self._learn_mapping(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``feature`` replaced by integer codes."""
        mapping = getattr(self, "encoding_dict_", None)
        if mapping is None:
            # Backward compatibility: transform() used to work without fit()
            # by deriving the codes from X itself. Keep that working, but warn,
            # because codes derived per data set are not comparable.
            import warnings

            warnings.warn(
                "TextEncoder.transform called before fit; deriving the "
                "encoding from the data being transformed.",
                UserWarning,
                stacklevel=2,
            )
            mapping = self._learn_mapping(X)

        # Copy so the caller's frame keeps its raw text column.
        X_encoded = X.copy()
        # Values missing from the mapping (unseen/rare categories, NaN) -> 0.
        X_encoded[self.feature] = (
            X[self.feature].map(mapping).fillna(0).astype("int64")
        )
        return X_encoded

# Build the preprocessing pipeline: encode the category column, then replace
# every remaining missing value with the constant 0.
pipeline = Pipeline([
    ('text_encoder', TextEncoder(feature='category_list')),
    ('fill_na', SimpleImputer(strategy='constant', fill_value=0))
])

# Forward slash instead of '\': '\k' is an invalid escape sequence in a normal
# string literal, and a backslash separator only works on Windows.
train_data = pd.read_csv('datasets/kaggle_startups_train_28062024.csv')

# Fit the pipeline on the training data and transform it.
processed_values = pipeline.fit_transform(train_data)

# Re-attach the column labels, taken from the input frame rather than from a
# hard-coded list that has to be kept in sync with the CSV by hand.
df_processed = pd.DataFrame(processed_values, columns=train_data.columns)

df_processed.head(10)

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,closed_at
0,Lunchgate,0,828626.0,operating,CHE,25,Zurich,Zürich,2,2009-10-17,2011-05-01,2014-12-01,0
1,EarLens,0,42935019.0,operating,USA,CA,SF Bay Area,Redwood City,4,2005-01-01,2010-05-04,2014-02-25,0
2,Reviva Pharmaceuticals,2,35456381.0,operating,USA,CA,SF Bay Area,San Jose,3,2006-01-01,2012-08-20,2014-07-02,0
3,Sancilio and Company,0,22250000.0,operating,0,0,0,0,3,2004-01-01,2011-09-01,2014-07-18,0
4,WireTough Cylinders,0,0.0,operating,USA,VA,VA - Other,Bristol,1,2010-05-12,2012-02-01,2012-02-01,0
5,Connected Sports Ventures,0,4300000.0,operating,USA,NJ,Newark,Princeton,1,2011-04-16,2012-11-12,2012-11-12,0
6,Attensity,0,90000000.0,operating,USA,CA,SF Bay Area,Redwood City,1,2000-01-01,2014-05-14,2014-05-14,0
7,Mesh Networks,1,4300000.0,operating,USA,TX,Houston,Houston,1,2005-01-01,2014-11-09,2014-11-09,0
8,AngioScore,2,42000000.0,operating,USA,CA,SF Bay Area,Fremont,2,2003-01-01,2007-10-09,2011-04-20,0
9,Vidatronic,0,1250500.0,operating,USA,TX,Austin,College Station,2,2010-01-01,2011-08-23,2013-03-21,0


In [4]:
# Forward slash instead of '\': '\k' is an invalid escape sequence in a normal
# string literal, and a backslash separator only works on Windows.
test_data = pd.read_csv('datasets/kaggle_startups_test_28062024.csv')

# Preview the first rows of the raw test set.
display(test_data.head(10))

Unnamed: 0,name,category_list,funding_total_usd,country_code,state_code,region,city,funding_rounds,first_funding_at,last_funding_at,lifetime
0,Crystalsol,Clean Technology,2819200.0,NIC,17,,,1,2009-07-01,2009-07-01,3501
1,JBI Fish & Wings,Hospitality,,USA,TN,TN - Other,Humboldt,1,2010-07-28,2010-07-28,2717
2,COINPLUS,Finance,428257.0,LUX,3,Esch-sur-alzette,Esch-sur-alzette,2,2014-05-15,2014-09-18,1295
3,Imagine Communications,Software|Video|Video Streaming,34700000.0,USA,CA,San Diego,San Diego,4,2005-01-01,2010-04-20,4748
4,DNA13,Software,4530000.0,CAN,ON,Ottawa,Ottawa,1,2007-05-08,2007-05-08,6209
5,Quickfire Games,Design|Entertainment|Games,160000.0,,,,,2,2013-09-18,2014-09-18,1583
6,Sente Inc.,Biotechnology,26842000.0,USA,CA,San Diego,Encinitas,5,2009-01-31,2014-06-02,4018
7,Triosyn,Health Care|Medical|Therapeutics,4000000.0,,,,,1,2003-06-02,2003-06-02,5576
8,Urgent.ly,Software,8710000.0,USA,VA,"Washington, D.C.",Sterling,3,2014-04-17,2015-09-29,1826
9,Map Decisions,Software,13200.0,USA,PA,Allentown,Bethlehem,1,2013-08-09,2013-08-09,2057
