In [None]:
# import libraries
import numpy as np
import pandas as pd

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# pandas deprecation / future-behaviour warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw train and test tables from the CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train0.csv', 'test0.csv'))

In [None]:
# remove spaces from the data
def strip_text_columns(df):
    """Return ``df`` with leading/trailing whitespace removed from its text cells.

    Only columns whose dtype is object or a dedicated string dtype are touched;
    every other column is returned unchanged. Inside a text column only real
    ``str`` values are stripped: NaN and any non-string value are passed through
    as they are. (The vectorised ``.str.strip()`` would turn non-string values
    of a mixed-type object column into NaN and silently lose them.)
    """
    def _strip_column(col):
        # is_string_dtype on the dtype itself is True for object and string dtypes.
        if not pd.api.types.is_string_dtype(col.dtype):
            return col
        return col.map(lambda value: value.strip() if isinstance(value, str) else value)

    return df.apply(_strip_column)


train = strip_text_columns(train)
test = strip_text_columns(test)

In [None]:
# Turn the textual missing-value markers into real NaN values, one marker at a
# time, in both frames.
for marker in ("nan", "?", "None"):
    train = train.replace(marker, np.nan)
    test = test.replace(marker, np.nan)

In [None]:
# Coerce 'Revenue', 'Net_Valuation' and 'Share_Price' to numeric values in both
# frames; entries that cannot be parsed become NaN (errors='coerce').
for frame in (train, test):
    for column in ('Revenue', 'Net_Valuation', 'Share_Price'):
        frame[column] = pd.to_numeric(frame[column], errors='coerce')


In [None]:
# show the column names of the training frame
train.columns

Index(['File_Name', 'Location', 'Sector', 'Employees', 'Revenue',
       'Net_Valuation', 'Share_Price', 'Company_Background', 'Product'],
      dtype='object')

In [None]:
# distinct values found in the Product column of the training frame
train['Product'].unique()

array(['AI', 'Truck', 'Medical devices', 'antivirus-security', 'Hydro',
       'Organicfood', 'charcoal', 'Kids toys', 'Fuel', 'Gaming hardware',
       'Bike', 'allopathy', 'Solar'], dtype=object)

In [None]:
# # drop File_Name
# train.drop('File_Name',axis=1,inplace=True)

# File_Name = test['File_Name']
# test.drop('File_Name',axis=1,inplace=True)

In [None]:
# unique values in Location (train and test combined)
# Series.append was removed in pandas 2.0, so the two columns are stacked with
# pd.concat instead; the resulting set of unique values is the same.
pd.concat([train.Location, test.Location]).unique().shape

(232,)

In [None]:
# unique values in Sector (train and test combined)
# Series.append was removed in pandas 2.0, so the two columns are stacked with
# pd.concat instead; the resulting set of unique values is the same.
pd.concat([train.Sector, test.Sector]).unique()

array(['Public_sector', nan, 'Private_sector'], dtype=object)

In [None]:
# Rewrite the two Sector labels so they use a space instead of an underscore.
sector_labels = {
    'Public_sector': 'Public sector',
    'Private_sector': 'Private sector',
}
train['Sector'] = train['Sector'].replace(sector_labels)
test['Sector'] = test['Sector'].replace(sector_labels)

In [None]:
# fill missing values in Company_Background with empty string
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on an attribute-accessed column is a chained
# assignment and does not update the parent frame under pandas Copy-on-Write.
train['Company_Background'] = train['Company_Background'].fillna('')
test['Company_Background'] = test['Company_Background'].fillna('')

In [None]:
# concatenate Company_Background with all other columns

def append_company_facts(df):
    """Return a copy of ``df`` with one sentence per known attribute appended
    to ``Company_Background``.

    For every row, a sentence is appended for each of Location, Sector,
    Employees, Revenue, Net_Valuation and Share_Price that is not missing in
    that row; rows where the attribute is missing keep their text unchanged.
    As a side effect on the returned copy, ``Employees`` is converted to int
    with missing values replaced by 0.

    The same function is applied to train and test so both receive identical
    wording (previously the train text contained a stray '.' before
    ' employees.' that the test text did not have).

    Raises whatever ``astype(float)`` raises if ``Employees`` holds a value
    that cannot be parsed as a number.
    """
    out = df.copy()

    def add_sentence(known, prefix, values, suffix):
        # Only rows flagged in `known` get the sentence appended.
        out.loc[known, 'Company_Background'] = (
            out.loc[known, 'Company_Background'] + prefix + values[known] + suffix
        )

    # Location
    add_sentence(out['Location'].notnull(), ' This Company is based out of ', out['Location'], '.')

    # Sector
    add_sentence(out['Sector'].notnull(), ' This Company is in the ', out['Sector'], '.')

    # Employees: remember which rows were known BEFORE filling, then store the
    # count as an integer so it prints as "120" rather than "120.0".
    has_employees = out['Employees'].notnull()
    out['Employees'] = out['Employees'].astype(float).fillna(0).astype(int)
    add_sentence(has_employees, ' This Company has ', out['Employees'].astype(str), ' employees.')

    # Revenue, Net_Valuation, Share_Price: rounded to one decimal place.
    # The spelling ' Milions.' is kept byte-for-byte so the generated text stays
    # consistent with the hand-written test row later in the notebook.
    numeric_sentences = (
        ('Revenue', ' This Company has a revenue of ', ' Milions.'),
        ('Net_Valuation', ' This Company has a net valuation of ', '.'),
        ('Share_Price', ' This Company has a share price of ', '.'),
    )
    for column, prefix, suffix in numeric_sentences:
        add_sentence(out[column].notnull(), prefix, out[column].round(1).astype(str), suffix)

    return out


train = append_company_facts(train)

# Test data
test = append_company_facts(test)


In [None]:
# shape of the subset of training rows whose background text is still empty
train.loc[train['Company_Background'] == ''].shape

(0, 9)

In [None]:
# Remove the columns that were merged into Company_Background from both frames.
merged_columns = ['Location', 'Sector', 'Employees', 'Revenue', 'Net_Valuation', 'Share_Price']
train = train.drop(columns=merged_columns)
test = test.drop(columns=merged_columns)

In [None]:
# Overwrite the background text of the test row labelled 0 with a hand-written
# sentence. NOTE(review): the reason for this manual override is not visible
# in the notebook - confirm it is still wanted.
# .loc is used instead of `test.Company_Background[0] = ...`, which is a chained
# assignment and is not guaranteed to write through to `test` (it does not under
# pandas Copy-on-Write).
test.loc[0, 'Company_Background'] = 'This Company has a revenue of 139.3 Milions. This Company has a net valuation of 27869.4. This Company has a share price of 214428.8.'

In [None]:
# Write both preprocessed frames to CSV, leaving the index out of the files.
for frame, out_path in ((train, 'train1.csv'), (test, 'test1.csv')):
    frame.to_csv(out_path, index=False)