In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from functools import reduce
import re
import unicodedata

In [2]:
data = pd.read_csv("companies.csv",encoding='utf-8')

for column in data.columns:
    print(f"Column: {column}, Data Type: {data[column].dtype}")


Column: company_id, Data Type: int64
Column: name, Data Type: object
Column: description, Data Type: object
Column: company_size, Data Type: float64
Column: state, Data Type: object
Column: country, Data Type: object
Column: city, Data Type: object
Column: zip_code, Data Type: object
Column: address, Data Type: object
Column: url, Data Type: object


In [3]:
distinct_states = data['state'].unique()

In [4]:
distinct_states_list = list(distinct_states)

In [5]:
# Mapping of state names to abbreviations
state_mapping = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    'Delhi': 'DL',
    'Karnataka': 'KA',
    'Gyeonggi-do': 'GG',
    'Paris': 'PAR',
    'HI': 'HI',
    'United States': 'US',
    'Telangana': 'TG',
    'Skane County': 'SK',
    'Herts': 'HE',
    'North-Rhine-Westphalia': 'NRW',
    'Dubai': 'DU',
    'CO.': 'CO',
    'Berlin': 'BE',
    'Ile de France': 'IDF',
    'West Midlands': 'WM',
    'Taiwan': 'TW',
    'Mississippi': 'MS',
    'Seoul': 'SO',
    'Israel': 'IL',
    'USA': 'US',
    'Cambs': 'CB',
    'Hamburg': 'HH',
    # Add more mappings as needed
}

# Function to normalize state names to abbreviations
def normalize_state(state_name):
    return state_mapping.get(state_name, state_name)

# Apply the normalization function to the array
data['state'] = data['state'].replace(state_mapping)


In [6]:
distinct_states_list = list(distinct_states)

In [7]:
distinct_states_list

['NY',
 'CA',
 'Texas',
 'Stockholm',
 'NJ',
 'BW',
 'Ohio',
 'NC',
 'New York',
 'London',
 'Hessen',
 'Pennsylvania',
 'Karnataka',
 'MA',
 'North Carolina',
 'Maharashtra',
 'Connecticut',
 'England',
 'WA',
 'Middlesex',
 'ON',
 'VA',
 'Baselstadt',
 'Quebec',
 'New Jersey',
 'IL',
 'MI',
 'California',
 'MN',
 'GA',
 'Cambridgeshire',
 'France',
 'ma',
 'Massachusetts',
 'Indiana',
 'PA',
 'Uttar Pradesh',
 'Wisconsin',
 'Ontario',
 'TX',
 'Utrecht',
 'British Columbia',
 'CT',
 'MD',
 'OH',
 'Ireland',
 'Arizona',
 'Michigan',
 'Noord-Holland',
 'WI',
 'mi',
 'le-de-France',
 'wa',
 'Georgia',
 'Arkansas',
 'il',
 'Illinois',
 'Minnesota',
 'Qubec',
 'Zuid-Holland',
 'Idaho',
 'Washington',
 'Fl',
 'FL',
 'South Holland',
 'Virginia',
 'RI',
 'Montana',
 'Weybridge',
 'md',
 'Berkshire',
 'Georgia ',
 'TN',
 'MO',
 'Baden-Wrttemberg',
 'Bavaria',
 'Zurich',
 'UT',
 'Auvergne-Rhne-Alpes',
 'Geneva',
 'Florida',
 'Kentucky',
 'CO',
 'Iowa',
 'NSW',
 'Tennessee',
 'Colorado',
 'Miss

In [8]:
count_zero_company_id = (data['company_id'] == '0').sum()
count_null_company_id = data['company_id'].isnull().sum()

count_zero_name = (data['name'] == '0').sum()
count_null_name = data['name'].isnull().sum()

count_zero_company_size = (data['company_size'] == '0').sum()
count_null_company_size = data['company_size'].isnull().sum()

count_zero_states = (data['state'] == '0').sum()
count_null_states = data['state'].isnull().sum()

count_zero_country = (data['country'] == '0').sum()
count_null_country = data['country'].isnull().sum()

count_zero_zip_code = (data['zip_code'] == '0').sum()
count_null_zip_code = data['zip_code'].isnull().sum()

count_zero_address = (data['address'] == '0').sum()
count_null_address = data['address'].isnull().sum()

count_zero_url = (data['url'] == '0').sum()
count_null_url = data['url'].isnull().sum()

print("Counts of '0' values:")
print("Company ID:", count_zero_company_id)
print("Name:", count_zero_name)
print("Company Size:", count_zero_company_size)
print("States:", count_zero_states)
print("Country:", count_zero_country)
print("Zip Code:", count_zero_zip_code)
print("Address:", count_zero_address)
print("URL:", count_zero_url)

print("\nCounts of null (missing) values:")
print("Company ID:", count_null_company_id)
print("Name:", count_null_name)
print("Company Size:", count_null_company_size)
print("States:", count_null_states)
print("Country:", count_null_country)
print("Zip Code:", count_null_zip_code)
print("Address:", count_null_address)
print("URL:", count_null_url)

Counts of '0' values:
Company ID: 0
Name: 0
Company Size: 0
States: 0
Country: 0
Zip Code: 0
Address: 0
URL: 0

Counts of null (missing) values:
Company ID: 0
Name: 0
Company Size: 589
States: 2
Country: 0
Zip Code: 0
Address: 2
URL: 0


In [9]:
#  remove the company size column 
data = data.drop('company_size', axis=1)

In [10]:
data1 = data[data['state'] != '0']

In [11]:
data2 = data[data['country'] != '0']

In [12]:
data3 = data[data['zip_code'] != '0']

In [13]:
data4 = data[data['address'] != '0']

In [14]:
data5 = data[data['city']!= '0']

In [15]:
data.count()

company_id     6063
name           6063
description    5999
state          6061
country        6063
city           6061
zip_code       6063
address        6061
url            6063
dtype: int64

In [16]:
# Mode of the state 
mode1 = data1['state'].mode().iloc[0]
mode1

'CA'

In [17]:
# Replace the missing and 0 values with mode
data['state'] = data['state'].replace('0', mode1)
data['state'] = data['state'].fillna(mode1)
data

Unnamed: 0,company_id,name,description,state,country,city,zip_code,address,url
0,1009,IBM,At IBM we do more than work. We create. We cre...,NY,US,Armonk New York,10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,CA,US,Chicago,10017,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,TX,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,Were a cloud technology company that provides ...,TX,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle
...,...,...,...,...,...,...,...,...,...
6058,3700144594,BYREDO,Beauty can be many things to many people. Our ...,CA,US,New York,10017,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/byredo
6059,3700144710,Pros2Plan a division of Spinnaker SCA,Pros2Plan a division of Spinnaker Services LL...,CO,US,Boulder,80303,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/pros2plan
6060,3700147810,Ascendo Resources,Ascendo Resources is a certified minority owne...,FL,US,Coral Gables,33134,2 Alhambra Plaza,https://www.linkedin.com/company/ascendoresources
6061,3700150295,The Crox Group,The Crox Group Head Quartered in Chicago with ...,IL,US,Lincolnwood,60712,6818 N Lincoln Ave,https://www.linkedin.com/company/the-crox-group


In [18]:
# Mode of the country 
mode2 = data2['country'].mode().iloc[0]

mode2

'US'

In [19]:
# Replace the missing and 0 values with mode
data['country'] = data['country'].replace('0', mode2)
data['country'] = data['country'].fillna(mode2)
data

Unnamed: 0,company_id,name,description,state,country,city,zip_code,address,url
0,1009,IBM,At IBM we do more than work. We create. We cre...,NY,US,Armonk New York,10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,CA,US,Chicago,10017,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,TX,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,Were a cloud technology company that provides ...,TX,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle
...,...,...,...,...,...,...,...,...,...
6058,3700144594,BYREDO,Beauty can be many things to many people. Our ...,CA,US,New York,10017,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/byredo
6059,3700144710,Pros2Plan a division of Spinnaker SCA,Pros2Plan a division of Spinnaker Services LL...,CO,US,Boulder,80303,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/pros2plan
6060,3700147810,Ascendo Resources,Ascendo Resources is a certified minority owne...,FL,US,Coral Gables,33134,2 Alhambra Plaza,https://www.linkedin.com/company/ascendoresources
6061,3700150295,The Crox Group,The Crox Group Head Quartered in Chicago with ...,IL,US,Lincolnwood,60712,6818 N Lincoln Ave,https://www.linkedin.com/company/the-crox-group


In [20]:
# Mode of the zip code 
mode3 = data3['zip_code'].mode().iloc[0]
mode3

'10017'

In [21]:
# Replace the missing and 0 values with mode
data['zip_code'] = data['zip_code'].replace('0', mode3)
data['zip_code'] = data['zip_code'].fillna(mode3)
data

Unnamed: 0,company_id,name,description,state,country,city,zip_code,address,url
0,1009,IBM,At IBM we do more than work. We create. We cre...,NY,US,Armonk New York,10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,CA,US,Chicago,10017,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,TX,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,Were a cloud technology company that provides ...,TX,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle
...,...,...,...,...,...,...,...,...,...
6058,3700144594,BYREDO,Beauty can be many things to many people. Our ...,CA,US,New York,10017,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/byredo
6059,3700144710,Pros2Plan a division of Spinnaker SCA,Pros2Plan a division of Spinnaker Services LL...,CO,US,Boulder,80303,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/pros2plan
6060,3700147810,Ascendo Resources,Ascendo Resources is a certified minority owne...,FL,US,Coral Gables,33134,2 Alhambra Plaza,https://www.linkedin.com/company/ascendoresources
6061,3700150295,The Crox Group,The Crox Group Head Quartered in Chicago with ...,IL,US,Lincolnwood,60712,6818 N Lincoln Ave,https://www.linkedin.com/company/the-crox-group


In [22]:
# Mode of the address 
mode4 = data4['address'].mode().iloc[0]
mode4

'3350 Riverwood Pkwy #1400'

In [23]:
# Replace the missing and 0 values with mode
data['address'] = data['address'].replace('0', mode4)
data['address'] = data['address'].fillna(mode4)
data

Unnamed: 0,company_id,name,description,state,country,city,zip_code,address,url
0,1009,IBM,At IBM we do more than work. We create. We cre...,NY,US,Armonk New York,10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,CA,US,Chicago,10017,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,GE Power part of GE Vernova is a world energy ...,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,TX,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,Were a cloud technology company that provides ...,TX,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle
...,...,...,...,...,...,...,...,...,...
6058,3700144594,BYREDO,Beauty can be many things to many people. Our ...,CA,US,New York,10017,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/byredo
6059,3700144710,Pros2Plan a division of Spinnaker SCA,Pros2Plan a division of Spinnaker Services LL...,CO,US,Boulder,80303,3350 Riverwood Pkwy #1400,https://www.linkedin.com/company/pros2plan
6060,3700147810,Ascendo Resources,Ascendo Resources is a certified minority owne...,FL,US,Coral Gables,33134,2 Alhambra Plaza,https://www.linkedin.com/company/ascendoresources
6061,3700150295,The Crox Group,The Crox Group Head Quartered in Chicago with ...,IL,US,Lincolnwood,60712,6818 N Lincoln Ave,https://www.linkedin.com/company/the-crox-group


In [24]:
# Mode of the city 
mode5 = data5['city'].mode().iloc[0]
mode5

'New York'

In [25]:
# Replace the missing and 0 values with mode
data['city'] = data['city'].replace('0', mode5)
data['city'] = data['city'].fillna(mode5)

In [26]:
count_zero_company_id = (data['company_id'] == '0').sum()
count_zero_name = (data['name'] == '0').sum()
count_zero_company_size = (data['company_size'] == '0').sum()
count_zero_states = (data['state'] == '0').sum()
count_zero_country = (data['country'] == '0').sum()
count_zero_zip_code = (data['zip_code'] == '0').sum()
count_zero_address = (data['address'] == '0').sum()
count_zero_url = (data['url'] == '0').sum()
print(count_zero_company_id)
print(count_zero_name)
print(count_zero_company_size)

print(count_zero_states)
print(count_zero_country)
print(count_zero_zip_code)
print(count_zero_address)

print(count_zero_url)

KeyError: 'company_size'

In [None]:
print(data.dtypes)

In [None]:
# Remove non ascii values from each column , also remove '\'',  '"'  , ','  , '' and replace with blank ''


def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['company_id'] = data['company_id'].apply(safe_normalize)
data['company_id'] = data['company_id'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['name'] = data['name'].apply(safe_normalize)
data['name'] = data['name'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['description'] = data['description'].apply(safe_normalize)
data['description'] = data['description'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['company_size'] = data['company_size'].apply(safe_normalize)
data['company_size'] = data['company_size'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['state'] = data['state'].apply(safe_normalize)
data['state'] = data['state'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['country'] = data['country'].apply(safe_normalize)
data['country'] = data['country'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['city'] = data['city'].apply(safe_normalize)
data['city'] = data['city'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['zip_code'] = data['zip_code'].apply(safe_normalize)
data['zip_code'] = data['zip_code'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['address'] = data['address'].apply(safe_normalize)
data['address'] = data['address'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)
def safe_normalize(x):
    if isinstance(x, str):
        return unicodedata.normalize('NFC', x)
    else:
        return str(x)
data['url'] = data['url'].apply(safe_normalize)
data['url'] = data['url'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
data = data.replace({'\'': '', '"': '', ',': '', '': ''}, regex=True)



In [None]:
data['company_id'] = data['company_id'].astype('int64')
data['name'] = data['name'].astype('string')  # Change to string data type
data['description'] = data['description'].astype('string')  # Change to string data type
data['company_size'] = data['company_size'].astype('float64')
data['state'] = data['state'].astype('string')  # Change to string data type
data['country'] = data['country'].astype('string')  # Change to string data type
data['city'] = data['city'].astype('string')  # Change to string data type
data['zip_code'] = data['zip_code'].astype('string')  # Change to string data type
data['address'] = data['address'].astype('string')  # Change to string data type
data['url'] = data['url'].astype('string')  # Change to string data type

# Check the updated data types
print(data.dtypes)

In [None]:
data.count()

In [None]:
data.to_csv('~/Documents/Preprocessed_Companies.csv',index=False, encoding = 'utf-8')