# In this notebook we'll rebuild our labels dataset. We will parse it and collect to a new representation form.

In [2]:
import numpy as np
import pandas as pd

In [4]:
cars = pd.read_csv("../data/source/original_cars_brands_and_models.csv")

In [5]:
cars.iloc[1]

Марка авто    NaN
Модель        MDX
Name: 1, dtype: object

The dataset consists of two columns: the brand of the car and the model. Following the specification of the car brand, the column then contains None values, while in the second column, the model names are presented in parallel. When the models for a particular brand are exhausted, a new brand is indicated, and the process repeats.

In [6]:
cars.replace(np.nan, None, inplace=True)

In [7]:
def collect_models(dataset: pd.DataFrame):
    names = {}
    models = []
    i = 0
    last_model = dataset.iat[0, 0]
    while i < dataset.shape[0]:
        brand, model = dataset.iloc[i]


        # continue
        if not brand: # None

            models.append(model)
        else:
            names[last_model] = models
            last_model = dataset.iat[i, 0]
            models = []
        i+=1
        # print(models)
    return names

In [8]:
collect_models(cars)

{'Acura': ['MDX'],
 'Alfa Romeo': ['Giulietta'],
 'Audi': ['80',
  '100',
  'A1',
  'A3',
  'A4',
  'A5',
  'A6',
  'A7',
  'A8',
  'Q3',
  'Q5',
  'Q7',
  'R8',
  'RS6',
  'S5',
  'S8',
  'TT'],
 'BAW': ['FENIX'],
 'BAZ': ['A079 Etalon'],
 'BMW': ['1 Series',
  '3 Series',
  '4 Series',
  '5 Series',
  '6 Series',
  '7 Series',
  'X1',
  'X3',
  'X4',
  'X5',
  'X6',
  'Z3',
  'Z4'],
 'Brilliance': ['V5'],
 'BYD': ['F3'],
 'Cadillac': ['CTS', 'Escalade', 'SRX'],
 'Carrus': ['Star'],
 'Chery': ['Amulet A15', 'Fora', 'Kimo S12', 'Tiggo', 'Tiggo 5', 'Tiggo T11'],
 'Chevrolet': ['Aveo',
  'Blazer',
  'Camaro',
  'Caprice',
  'Captiva',
  'Cobalt',
  'Cruze',
  'Epica',
  'Express',
  'Lacetti',
  'Lanos',
  'Niva',
  'Orlando',
  'Rezzo',
  'Spark',
  'Suburban',
  'Tahoe',
  'TrailBlazer'],
 'Chrysler': ['300',
  '300С',
  'Grand Voyager',
  'PT Cruiser',
  'Pacifica',
  'Sebring',
  'Town and Country',
  'Voyager'],
 'Citroen': ['BX',
  'Berlingo',
  'C1',
  'C2 C3',
  'C3 Picasso',
  '

In [9]:
df = pd.DataFrame.from_dict(collect_models(cars), orient='index').transpose()

In [10]:
df

Unnamed: 0,Acura,Alfa Romeo,Audi,BAW,BAZ,BMW,Brilliance,BYD,Cadillac,Carrus,...,Suzuki,Toyota,UAZ,Van Hool,VAZ Lada,VIS,Volkswagen,Volvo,Vortex,ZAZ
0,MDX,Giulietta,80,FENIX,A079 Etalon,1 Series,V5,F3,CTS,Star,...,Escudo,4Runner,4320,T815,2101,2347,Amarok,740,Tingo,968
1,,,100,,,3 Series,,,Escalade,,...,Grand Vitara,Allion,5557,,2102,2349,Bora,850,,11055
2,,,A1,,,4 Series,,,SRX,,...,Ignis,Alphard,,,2103,234700,Caddy,940,,1102 Tavriya
3,,,A3,,,5 Series,,,,,...,Jimny,Altezza,,,2104,234900,Caravelle,240 Series,,1103 Slavuta
4,,,A4,,,6 Series,,,,,...,Liana,Auris,,,2105,,Crafter,F12,,Chance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,,,,,,,,,,,...,,Vitz,,,,,,,,
76,,,,,,,,,,,...,,Windom,,,,,,,,
77,,,,,,,,,,,...,,Wish,,,,,,,,
78,,,,,,,,,,,...,,Yaris,,,,,,,,


Example:

In [11]:
bmw = list(df['BMW'][df['BMW'].notnull()])

In [12]:
pd.DataFrame.from_dict(collect_models(cars), orient='index')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
Acura,MDX,,,,,,,,,,...,,,,,,,,,,
Alfa Romeo,Giulietta,,,,,,,,,,...,,,,,,,,,,
Audi,80,100,A1,A3,A4,A5,A6,A7,A8,Q3,...,,,,,,,,,,
BAW,FENIX,,,,,,,,,,...,,,,,,,,,,
BAZ,A079 Etalon,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VIS,2347,2349,234700,234900,,,,,,,...,,,,,,,,,,
Volkswagen,Amarok,Bora,Caddy,Caravelle,Crafter,Golf,Golf Plus,Jetta,LT,Multivan,...,,,,,,,,,,
Volvo,740,850,940,240 Series,F12,FE,FH,FL,FM,FMX,...,,,,,,,,,,
Vortex,Tingo,,,,,,,,,,...,,,,,,,,,,


In [13]:
bmw

['1 Series',
 '3 Series',
 '4 Series',
 '5 Series',
 '6 Series',
 '7 Series',
 'X1',
 'X3',
 'X4',
 'X5',
 'X6',
 'Z3',
 'Z4']

In [15]:
df.to_csv("../data/source/processed_original_models.csv", index=False)

In [16]:
pd.read_csv("../data/source/processed_original_models.csv")

Unnamed: 0,Acura,Alfa Romeo,Audi,BAW,BAZ,BMW,Brilliance,BYD,Cadillac,Carrus,...,Suzuki,Toyota,UAZ,Van Hool,VAZ Lada,VIS,Volkswagen,Volvo,Vortex,ZAZ
0,MDX,Giulietta,80,FENIX,A079 Etalon,1 Series,V5,F3,CTS,Star,...,Escudo,4Runner,4320.0,T815,2101,2347.0,Amarok,740,Tingo,968
1,,,100,,,3 Series,,,Escalade,,...,Grand Vitara,Allion,5557.0,,2102,2349.0,Bora,850,,11055
2,,,A1,,,4 Series,,,SRX,,...,Ignis,Alphard,,,2103,234700.0,Caddy,940,,1102 Tavriya
3,,,A3,,,5 Series,,,,,...,Jimny,Altezza,,,2104,234900.0,Caravelle,240 Series,,1103 Slavuta
4,,,A4,,,6 Series,,,,,...,Liana,Auris,,,2105,,Crafter,F12,,Chance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,,,,,,,,,,,...,,Vitz,,,,,,,,
76,,,,,,,,,,,...,,Windom,,,,,,,,
77,,,,,,,,,,,...,,Wish,,,,,,,,
78,,,,,,,,,,,...,,Yaris,,,,,,,,
