In [265]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, Input
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## Dataset

In [266]:
# Load the training split and the test split from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## EDA

In [267]:
# Peek at the first rows of the training split to see the available columns.
train.head()

Unnamed: 0,Ticker,Company Name,Sector,Industry Group,Industry,Description,Sub-Industry
0,STAF,STAFFING 360 SOLUTIONS INC,Industrials,Commercial & Professional Services,Professional Services,"Staffing 360 Solutions, Inc. is an internation...",Human Resource & Employment Services
1,NXTC,NEXTCURE INC,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Biotechnology,"NextCure, Inc. is a clinical-stage biopharmace...",Biotechnology
2,CVBF,CVB FINANCIAL CORP,Financials,Banks,Banks,CVB Financial Corp. is a bank holding company ...,Regional Banks
3,ARCT,ARCTURUS THERAPETCS HOLD INC,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Biotechnology,Arcturus Therapeutics Holdings Inc. is a late-...,Biotechnology
4,SMPL,SIMPLY GOOD FOODS COMPANY,Consumer Staples,"Food, Beverage & Tobacco",Food Products,"Simply Good Foods Co is a developer, marketer ...",Packaged Foods & Meats


In [268]:
# Peek at the first rows of the test split for comparison with the training columns.
test.head()

Unnamed: 0,Ticker,Company Name,Sector,Industry Group,Industry,Description
0,PLPC,PREFORMED LINE PRODUCTS CO,Industrials,Capital Goods,Electrical Equipment,"Preformed Line Products Company, together with..."
1,VAPO,VAPOTHERM INC,Health Care,Health Care Equipment & Services,Health Care Equipment & Supplies,"Vapotherm, Inc. is a global medical technology..."
2,SIX,SIX FLAGS ENTERTAINMENT CORP,Consumer Discretionary,Consumer Services,"Hotels, Restaurants & Leisure",Six Flags Entertainment Corporation is a regio...
3,LODE,COMSTOCK MINING INC,Materials,Materials,Metals & Mining,"Comstock Inc., formerly Comstock Mining Inc., ..."
4,MAXR,MAXAR TECHNOLOGIES INC,Industrials,Capital Goods,Aerospace & Defense,"Maxar Technologies, Inc. is a space technology..."


### How many classes exist?

In [269]:
# Number of distinct target labels in the training split; reused later as the
# width of each model's softmax output layer.
number_of_classes = len(train["Sub-Industry"].unique())
number_of_classes

155

### Check intersection for some columns

In [270]:
# Ticker: which symbols appear in both the training and the test split?
set(train["Ticker"].unique()) & set(test["Ticker"].unique())
# If the overlap is empty, Ticker cannot help on the test split and is dropped as a feature.

set()

In [271]:
# Company Name: split every distinct name on whitespace, gather the words used
# in each split, and show the words the two splits have in common.

company_names_train_list = [name.split() for name in train["Company Name"].unique()]
company_names_train = {word for words in company_names_train_list for word in words}

company_names_test_list = [name.split() for name in test["Company Name"].unique()]
company_names_test = {word for words in company_names_test_list for word in words}

company_names_train & company_names_test

{'&',
 '(THE)',
 '-CL',
 '-LP',
 'A',
 'ACADIA',
 'ACCEPTANCE',
 'ACQ',
 'ADVANCED',
 'AGRICULTURE',
 'AIR',
 'AIRLINES',
 'ALLIED',
 'AMC',
 'AMERICA',
 'AMERICAN',
 'APPAREL',
 'APPLE',
 'APPLIED',
 'ASPEN',
 'ASSET',
 'ASSOCIATED',
 'AUTO',
 'AUTOMOTIVE',
 'AVID',
 'BANCORP',
 'BANCSHARES',
 'BANK',
 'BANKING',
 'BANKSHARES',
 'BASIN',
 'BDC',
 'BEAUTY',
 'BIG',
 'BIO',
 'BIOLOGICS',
 'BIOPHARMA',
 'BIOSCIENCE',
 'BIOSCIENCES',
 'BIOTECH',
 'BIOTECHNOLOGY',
 'BK',
 'BLACK',
 'BOX',
 'BRANDS',
 'BROADCAST',
 'BROTHERS',
 'BROWN',
 'CABLE',
 'CALIFORNIA',
 'CAP',
 'CAPITAL',
 'CARE',
 'CATALYST',
 'CELL',
 'CENTERS',
 'CENTRAL',
 'CENTURY',
 'CHEMICAL',
 'CHENIERE',
 'CHINA',
 'CINCINNATI',
 'CITY',
 'CLEAN',
 'CMNTY',
 'CO',
 'CO/DE',
 'COMMERCE',
 'COMMERCIAL',
 'COMMUNICATIONS',
 'COMMUNITY',
 'COMPANY',
 'COMPASS',
 'COMSTOCK',
 'CONCEPT',
 'CONSOLIDATED',
 'CONSTRUCTION',
 'CORP',
 'CORP/DE',
 'CORP/FL',
 'COS',
 'CP',
 'CTI',
 'CVR',
 'DATA',
 'DELTA',
 'DEVICES',
 'DIAGNOSTICS'

In [272]:
# Sector: values present in both the training and the test split.
set(train["Sector"].unique()) & set(test["Sector"].unique())

{'Communication Services',
 'Consumer Discretionary',
 'Consumer Staples',
 'Energy',
 'Financials',
 'Health Care',
 'Industrials',
 'Information Technology',
 'Materials',
 'Real Estate',
 'Utilities'}

In [273]:
# Industry Group: values present in both the training and the test split.
set(train["Industry Group"].unique()) & set(test["Industry Group"].unique())

{'Automobiles & Components',
 'Banks',
 'Capital Goods',
 'Commercial  & Professional Services',
 'Consumer Durables & Apparel',
 'Consumer Services',
 'Diversified Financials',
 'Energy',
 'Food & Staples Retailing',
 'Food, Beverage & Tobacco',
 'Health Care Equipment & Services',
 'Household & Personal Products',
 'Insurance',
 'Materials',
 'Media & Entertainment',
 'Pharmaceuticals, Biotechnology & Life Sciences',
 'Real Estate',
 'Retailing',
 'Semiconductors & Semiconductor Equipment',
 'Software & Services',
 'Technology Hardware & Equipment',
 'Telecommunication Services',
 'Transportation',
 'Utilities'}

In [274]:
# Industry: values present in both the training and the test split.
set(train["Industry"].unique()) & set(test["Industry"].unique())

{'Aerospace & Defense',
 'Air Freight & Logistics',
 'Airlines',
 'Auto Components',
 'Automobiles',
 'Banks',
 'Beverages',
 'Biotechnology',
 'Building Products',
 'Capital Markets',
 'Chemicals',
 'Commercial Services & Supplies',
 'Communications Equipment',
 'Construction & Engineering',
 'Construction Materials',
 'Consumer Finance',
 'Containers & Packaging',
 'Distributors',
 'Diversified Consumer Services',
 'Diversified Financial Services',
 'Diversified Telecommunication Services',
 'Electric Utilities',
 'Electrical Equipment',
 'Electronic Equipment, Instruments & Components',
 'Energy Equipment & Services',
 'Entertainment',
 'Equity Real Estate \nInvestment Trusts \n(REITs)',
 'Food & Staples Retailing',
 'Food Products',
 'Gas Utilities',
 'Health Care Equipment & Supplies',
 'Health Care Providers & Services',
 'Health Care Technology',
 'Hotels, Restaurants & Leisure',
 'Household Durables',
 'Household Products',
 'IT Services',
 'Independent Power and Renewable Elec

## Cleaning

### Sub-Industry

In [275]:
# Distinct target labels, in order of first appearance in the training split.
sub_industry_unique = list(train["Sub-Industry"].unique())

In [276]:
# Two-way lookup between integer class ids and Sub-Industry names.
numbers_to_string = dict(enumerate(sub_industry_unique))
strings_to_number = dict(zip(sub_industry_unique, range(len(sub_industry_unique))))

In [277]:
# Encode every training label as its integer class id.
y_train = train["Sub-Industry"].apply(lambda label: strings_to_number[label])

### Select columns to join

In [278]:

# Feature columns: everything except the target and the Ticker symbol.
columns_train = [col for col in train.columns if col not in ("Sub-Industry", "Ticker")]

### Create X_train, X_test

In [279]:
# Concatenate the selected columns of each training row into one space-separated string.
X_train_v1 = train[columns_train].apply(lambda row: " ".join(row.to_numpy()), axis=1)

In [280]:
# Build the test text the same way, using the same columns as for training.
X_test_v1 = test[columns_train].apply(lambda row: " ".join(row.to_numpy()), axis=1)

### Tokenization

In [281]:
# Fit the vocabulary on the training text only, so nothing from the test split
# leaks into it; words first seen in the test text map to the "OOV" token.
tokenizer = Tokenizer(oov_token = "OOV")
tokenizer.fit_on_texts(X_train_v1)

# Vocabulary size used by the Embedding layer. Word indices start at 1 (index 0
# is reserved and is what pad_sequences pads with), hence the +1. Passing a
# hard-coded `num_words` equal to len(word_index) would drop the last word,
# because Tokenizer keeps only indices strictly below `num_words`.
NUM_WORDS = len(tokenizer.word_index) + 1

X_train_v2 = tokenizer.texts_to_sequences(X_train_v1)
X_test_v2 = tokenizer.texts_to_sequences(X_test_v1)

In [282]:
# Number of entries in the fitted vocabulary (the OOV token is one of them).
len(tokenizer.word_index)

20477

### Padding

Find the maximum sequence length in the training set; it is used as the padding length.

In [283]:
# Token count of every tokenized training sequence.
len_seqs = list(map(len, X_train_v2))

In [284]:
# Length, in tokens, of the longest tokenized training sequence.
max(len_seqs)

232

In [285]:
# Pad to the longest training sequence. The value is derived from the data
# rather than hard-coded: a fixed length below the observed maximum silently
# truncates the longest training texts.
MAX_LENGTH = max(len_seqs)
# X_train final: pad at the end of each sequence up to MAX_LENGTH
X_train = pad_sequences(X_train_v2, padding = "post", truncating = "post", maxlen= MAX_LENGTH)
# X_test final: same width as X_train; longer test sequences are cut at the end
X_test = pad_sequences(X_test_v2, padding = "post", truncating = "post", maxlen= MAX_LENGTH)

## Model

### Simple Neural Network

In [290]:
# Number of training epochs for the simple (fully connected) network.
SNN_EPOCHS = 200

In [293]:
def build_snn_model():
    """Build and compile the baseline fully connected classifier.

    The input width is taken from ``X_train`` (one padded token-id sequence
    per sample) and the output is a softmax over ``number_of_classes`` labels.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    # NOTE(review): the padded token ids reach the Dense layers as plain
    # numeric features; consider an Embedding layer if this baseline
    # collapses to a single predicted class.
    layers = [
        Input(shape = (X_train.shape[1],)),
        Dense(20),  # no activation: a purely linear first projection
        Dense(32, activation = 'relu'),
        Dense(16, activation = 'relu'),
        Dense(number_of_classes, activation = "softmax"),
    ]
    model = Sequential(layers)
    model.compile(
        loss = "sparse_categorical_crossentropy",
        optimizer = "adam",
        metrics = ["accuracy"],
    )
    return model

In [294]:
# Train the simple network on the full training set (no validation split).
snn_model = build_snn_model()
snn_model.fit(x = X_train, y = y_train, epochs = SNN_EPOCHS)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f6f72f714e0>

In [None]:
# Run the trained simple network on the padded test sequences; its last layer
# is a softmax, so each row holds one score per class.
snn_prediction_v1 = snn_model.predict(X_test)



In [None]:
# Pick the highest-scoring class id for every test row.
snn_prediction_v2 = list(np.argmax(snn_prediction_v1, axis=1))

In [None]:
# First five predicted class ids from the simple network.
snn_prediction_v2[:5]

[1, 1, 1, 1, 1]

In [None]:
# Decode the predicted class ids back to Sub-Industry names.
snn_prediction_v3 = [numbers_to_string[class_id] for class_id in snn_prediction_v2]

In [None]:
# First five simple-network predictions as Sub-Industry names.
snn_prediction_v3[:5]

['Biotechnology',
 'Biotechnology',
 'Biotechnology',
 'Biotechnology',
 'Biotechnology']

In [None]:
# Collect the decoded simple-network predictions in a one-column table.
snn_prediction = pd.DataFrame(snn_prediction_v3, columns=["Sub-Industry"])

In [None]:
# Display the simple-network prediction table.
snn_prediction

Unnamed: 0,Sub-Industry
0,Biotechnology
1,Biotechnology
2,Biotechnology
3,Biotechnology
4,Biotechnology
...,...
939,Biotechnology
940,Biotechnology
941,Biotechnology
942,Biotechnology


### Complicated Neural Network

In [295]:
EMBED_DIM = 150  # output dimension passed to the Embedding layer
CNN_EPOCHS = 10  # number of training epochs for the CNN

In [296]:
def build_cnn_model():
    """Build and compile the embedding + 1-D convolution classifier.

    Token ids are embedded into ``EMBED_DIM``-sized vectors, passed through one
    convolution / max-pooling stage with dropout, flattened, and classified by
    a dense layer followed by a softmax over ``number_of_classes`` labels.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    layers = [
        Embedding(NUM_WORDS, EMBED_DIM, input_length = MAX_LENGTH),
        Conv1D(16, 3, activation = 'relu'),  # 16 filters, kernel size 3
        MaxPooling1D(3),
        Dropout(0.2),
        Flatten(),
        Dense(100, activation = 'relu'),
        Dense(number_of_classes, activation = "softmax"),
    ]
    model = Sequential(layers)
    model.compile(
        loss = "sparse_categorical_crossentropy",
        optimizer = "adam",
        metrics = ["accuracy"],
    )
    return model

In [297]:
# Train the CNN on the full training set (no validation split).
cnn_model = build_cnn_model()
cnn_model.fit(x = X_train, y = y_train, epochs = CNN_EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6f72f7ebc0>

In [298]:
# Run the trained CNN on the padded test sequences; its last layer is a
# softmax, so each row holds one score per class.
cnn_prediction_v1 = cnn_model.predict(X_test)



In [299]:
# Pick the highest-scoring class id for every test row.
cnn_prediction_v2 = list(np.argmax(cnn_prediction_v1, axis=1))

In [300]:
# First five predicted class ids from the CNN.
cnn_prediction_v2[:5]

[44, 12, 59, 21, 63]

In [301]:
# Decode the predicted class ids back to Sub-Industry names.
cnn_prediction_v3 = [numbers_to_string[class_id] for class_id in cnn_prediction_v2]

In [302]:
# First five CNN predictions as Sub-Industry names.
cnn_prediction_v3[:5]

['Electrical Components & Equipment',
 'Health Care Equipment',
 'Environmental & Facilities Services',
 'Property & Casualty Insurance',
 'Aerospace & Defense']

In [304]:
# Joined input text of the test row labelled 0, for comparison with its prediction.
X_test_v1[:5][0]

'PREFORMED LINE PRODUCTS CO Industrials Capital Goods Electrical Equipment Preformed Line Products Company, together with its subsidiaries, is a designer and manufacturer of products and systems employed in the construction and maintenance of overhead, ground-mounted and underground networks for the energy, telecommunication, cable operators, information (data communication), and other similar industries. The Company’s products include Energy Products, which are used to support, protect, terminate and secure both power conductor and fiber communication cables and to control cable dynamics; Communications Products, including protective closures, which are used to protect fixed line communication networks, such as fiber optic cable or copper cable, from moisture, environmental hazards and other potential contaminants, and Special Industries Products, including hardware assemblies, pole line hardware, resale products, underground connectors, solar hardware systems, guy markers, tree guard

In [305]:
# Joined input text of the test row labelled 1, for comparison with its prediction.
X_test_v1[:5][1]

'VAPOTHERM INC Health Care Health Care Equipment & Services Health Care Equipment & Supplies Vapotherm, Inc. is a global medical technology company. The Company is focused on the care of patients of all ages suffering from the respiratory distress often associated with complex lung diseases, such as chronic obstructive pulmonary disease (COPD), congestive heart failure (CHF), pneumonia, asthma and COVID-19. Its device solutions are focused on high velocity nasal insufflation (HVNI), which delivers non-invasive ventilatory support to patients suffering from respiratory distress. Its HVNI technology delivers heated, humidified and oxygenated air at a high velocity through a small-bore nasal interface. It offers four versions of its Precision Flow systems, which includes Precision Flow Hi-VNI, Precision Flow Plus, Precision Flow Classic and Precision Flow Heliox. Its Precision Flow systems include a capital unit, a single-use disposable and a nasal interface. Its digital solutions are foc

In [306]:
# Joined input text of the test row labelled 2, for comparison with its prediction.
X_test_v1[:5][2]

"SIX FLAGS ENTERTAINMENT CORP Consumer Discretionary Consumer Services Hotels, Restaurants & Leisure Six Flags Entertainment Corporation is a regional theme park operator. The Company owns and operates approximately 27 regional theme and water parks. Its parks occupy approximately 6,000 acres of land and is located in geographically diverse markets across North America. Its parks offer a selection of thrill rides, water attractions, themed areas, concerts and shows, restaurants, game venues and retail outlets. Its parks contain approximately 900 rides, including over 140 roller coasters. It offers food, beverages, merchandise and other products and services within its parks. The Company's parks include Six Flags America, Six Flags Discovery Kingdom, Six Flags Fiesta Texas, Six Flags Great Adventure & Safari/ Six Flags Hurricane Harbor, Six Flags Great America, Six Flags Hurricane Harbor, Six Flags Magic Mountain, Six Flags Mexico, Six Flags New England, Six Flags St. Louis, Six Flags W

In [307]:
# Joined input text of the test row labelled 3, for comparison with its prediction.
X_test_v1[:5][3]

"COMSTOCK MINING INC Materials Materials Metals & Mining Comstock Inc., formerly Comstock Mining Inc., is engaged in developing technologies that enable systemic decarbonization and circularity by converting supplies of waste and other under-utilized natural resources into renewable fuels and electrification products that contribute to balancing global uses and emissions of carbon. The Company offers electrification (LiNiCo) and cellulosic fuels (Bioleum) products. It converts wasted and unused biomass feedstock into cellulosic ethanol and drop-in fuels. The Company's technologies are designed to crush, separate, and condition every class of lithium-ion battery feedstock together with their host devices and other electrification materials for client flexibility. Producers of gold and related products, including companies that mine or process gold and the South African finance houses which primarily invest in, but do not operate, gold mines."

In [308]:
# Joined input text of the test row labelled 4, for comparison with its prediction.
X_test_v1[:5][4]

'MAXAR TECHNOLOGIES INC Industrials Capital Goods Aerospace & Defense Maxar Technologies, Inc. is a space technology company that is specializing in manufacturing communication, earth observation, radar, and on-orbit servicing satellites, satellite products, and related services. It delivers disruptive value to government and commercial customers to help them monitor, understand, and navigate changing planet, deliver global broadband communications, and explore and advance the use of space. Its segments include Earth Intelligence and Space Infrastructure. The Earth Intelligence segment is a supplier of high-resolution space-based optical and radar imagery products and analytics. The Space Infrastructure segment is a provider of Space Infrastructure that designs, builds, integrates, and tests solutions for space-based communication satellites, on-orbit servicing, robotic assembly, and space exploration. It also provides geospatial services that combine imagery, analytic expertise, and t

In [None]:
# Collect the decoded CNN predictions in a one-column table.
cnn_prediction = pd.DataFrame(cnn_prediction_v3, columns=["Sub-Industry"])

In [None]:
# Display the CNN prediction table.
cnn_prediction

Unnamed: 0,Sub-Industry
0,Health Care Equipment
1,Health Care Equipment
2,Asset Management & Custody Banks
3,Health Care Equipment
4,Health Care Equipment
...,...
939,Health Care Equipment
940,Health Care Equipment
941,Regional Banks
942,Health Care Equipment
