In [2]:
import pandas as pd
import re

In [3]:
df = pd.read_csv("companies.csv")

In [4]:
# Drop rows where any of the specified columns contain null values
df = df.dropna(subset=['locality', 'region', 'name'])

print(df.head())

         country  founded                        id      industry  \
0  united states   2023.0             bedrock-black           NaN   
2  united states   1977.0           wh-construction  construction   
4  united states      NaN         gregory-hitchcock  construction   
5  united states      NaN  mission-imaging-services           NaN   
7  united states      NaN      althoff-lawn-service  construction   

                                    linkedin_url   locality  \
0             linkedin.com/company/bedrock-black       lehi   
2           linkedin.com/company/wh-construction      canby   
4         linkedin.com/company/gregory-hitchcock   new york   
5  linkedin.com/company/mission-imaging-services      arden   
7      linkedin.com/company/althoff-lawn-service  watertown   

                        name          region   size           website  
0              bedrock black            utah   1-10  bedrockblack.com  
2  ward-henshaw construction          oregon  11-50  ward-hens

In [5]:
# Remove the columns 'id' and 'country'
df = df.drop(columns=['id', 'country'])

# Rename the columns 'locality' to 'city' and 'region' to 'state'
df.rename(columns={'locality': 'city', 'region': 'state'}, inplace=True)

print(df.head())

   founded      industry                                   linkedin_url  \
0   2023.0           NaN             linkedin.com/company/bedrock-black   
2   1977.0  construction           linkedin.com/company/wh-construction   
4      NaN  construction         linkedin.com/company/gregory-hitchcock   
5      NaN           NaN  linkedin.com/company/mission-imaging-services   
7      NaN  construction      linkedin.com/company/althoff-lawn-service   

        city                       name           state   size  \
0       lehi              bedrock black            utah   1-10   
2      canby  ward-henshaw construction          oregon  11-50   
4   new york          gregory hitchcock        new york  11-50   
5      arden   mission imaging services  north carolina   1-10   
7  watertown       althoff lawn service    south dakota  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [6]:
# Reorder the columns so that 'name', 'city', and 'state' are the first three
columns_order = ['name', 'city', 'state'] + [col for col in df.columns if col not in ['name', 'city', 'state']]
df = df[columns_order]

print(df.head())

                        name       city           state  founded  \
0              bedrock black       lehi            utah   2023.0   
2  ward-henshaw construction      canby          oregon   1977.0   
4          gregory hitchcock   new york        new york      NaN   
5   mission imaging services      arden  north carolina      NaN   
7       althoff lawn service  watertown    south dakota      NaN   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
2  construction           linkedin.com/company/wh-construction  11-50   
4  construction         linkedin.com/company/gregory-hitchcock  11-50   
5           NaN  linkedin.com/company/mission-imaging-services   1-10   
7  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [7]:
# Convert 'city' and 'state' columns to lowercase
df['city'] = df['city'].str.lower()
df['state'] = df['state'].str.lower()

print(df.head())

                        name       city           state  founded  \
0              bedrock black       lehi            utah   2023.0   
2  ward-henshaw construction      canby          oregon   1977.0   
4          gregory hitchcock   new york        new york      NaN   
5   mission imaging services      arden  north carolina      NaN   
7       althoff lawn service  watertown    south dakota      NaN   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
2  construction           linkedin.com/company/wh-construction  11-50   
4  construction         linkedin.com/company/gregory-hitchcock  11-50   
5           NaN  linkedin.com/company/mission-imaging-services   1-10   
7  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [8]:
# Full names mapping (Only full names are used here)
valid_states = {
    'alabama', 'alaska', 'arizona', 'arkansas', 'california',
    'colorado', 'connecticut', 'delaware', 'district of columbia', 'florida', 'georgia',
    'hawaii', 'idaho', 'illinois', 'indiana', 'iowa',
    'kansas', 'kentucky', 'louisiana', 'maine', 'maryland',
    'massachusetts', 'michigan', 'minnesota', 'mississippi',
    'missouri', 'montana', 'nebraska', 'nevada', 'new hampshire',
    'new jersey', 'new mexico', 'new york', 'north carolina',
    'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
    'rhode island', 'south carolina', 'south dakota', 'tennessee',
    'texas', 'utah', 'vermont', 'virginia', 'washington',
    'west virginia', 'wisconsin', 'wyoming'
}

# Filter the DataFrame to include only rows where the state is in the set of valid states
df = df[df['state'].isin(valid_states)]

print(df.head())

                        name       city           state  founded  \
0              bedrock black       lehi            utah   2023.0   
2  ward-henshaw construction      canby          oregon   1977.0   
4          gregory hitchcock   new york        new york      NaN   
5   mission imaging services      arden  north carolina      NaN   
7       althoff lawn service  watertown    south dakota      NaN   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
2  construction           linkedin.com/company/wh-construction  11-50   
4  construction         linkedin.com/company/gregory-hitchcock  11-50   
5           NaN  linkedin.com/company/mission-imaging-services   1-10   
7  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [9]:
# Function to convert "saint " to "st. " in city names
def convert_saint_to_st(city_name):
    if isinstance(city_name, str):
        # Replace "saint " with "st. " (case-sensitive)
        return city_name.replace("saint ", "st. ")
    return city_name

# Apply the function to the 'city' column
df['city'] = df['city'].apply(convert_saint_to_st)

In [13]:
# Filter out rows where 'name' is not a string or is NaN
# Keep only rows where 'name' is a string and is not NaN
df = df[df['name'].apply(lambda x: isinstance(x, str))]

# Function to clean company names
def clean_company_name(name):
    # Remove commas
    name = name.replace(',', '')
    # Define a regular expression pattern to match unwanted suffixes
    pattern = (
            r'(\AThe\s+|'  # Match 'The' at the start of the string
            r'\s+(Inc\.?|L\.?L\.?C\.?|LP|PLLC|L\.P\.?|P\.C\.?|Ltd\.?|Co.|Corp\.?|Intl\.?|D\.?D\.?S\.?|llp|pc)\b|'
            r'\s+(holdings|holding|u\.s\.a|company|group|international|Corporation|&|incorporated|Co\.?)\s*$)'  # Match these at the end of the string
        )
    # Replace the matched patterns with an empty string
    name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    # Remove any trailing periods
    name = re.sub(r'\.\s*$', '', name)
    return name.strip()

# Apply the cleaning function to the 'name' column
df['name'] = df['name'].apply(clean_company_name)

print(df.head())

                        name       city           state  founded  \
0              bedrock black       lehi            utah     2023   
2  ward-henshaw construction      canby          oregon     1977   
4          gregory hitchcock   new york        new york     <NA>   
5   mission imaging services      arden  north carolina     <NA>   
7       althoff lawn service  watertown    south dakota     <NA>   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
2  construction           linkedin.com/company/wh-construction  11-50   
4  construction         linkedin.com/company/gregory-hitchcock  11-50   
5           NaN  linkedin.com/company/mission-imaging-services   1-10   
7  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [14]:
# Remove duplicate rows based on the combination of 'name', 'city', and 'state'
# keep='first' argument specifies that the first occurrence is kept, others are deleted
df_cleaned = df.drop_duplicates(subset=['name', 'city', 'state'], keep='first')

In [15]:
# Convert 'founded' column to integers, coercing errors to NaN
# This step will convert non-numeric values to NaN and leave existing NaN values as they are
df['founded'] = pd.to_numeric(df['founded'], errors='coerce').astype('Int64')

In [16]:
# Save the filtered DataFrame to a new CSV file
df.to_csv('clean-companies.csv', index=False)

In [8]:
# Generate csv to import into company table in database

# Load the cost of living data
companies_df = pd.read_csv("clean-companies.csv")

# Load city data
city_df = pd.read_csv("city-table.csv")
city_df.rename(columns={'id': 'city_id'}, inplace=True)

# Merge the DataFrames to map city_id based on city and state
merged_df = pd.merge(companies_df, city_df, on=['city', 'state'], how='left')  

# Check for any entries that did not get a city_id (these will need to be addressed)
print(merged_df[merged_df['city_id'].isnull()])

print(merged_df.head())

Empty DataFrame
Columns: [name, city, state, founded, industry, linkedin_url, size, website, city_id]
Index: []
                        name       city           state  founded  \
0              bedrock black       lehi            utah   2023.0   
1  ward-henshaw construction      canby          oregon   1977.0   
2          gregory hitchcock   new york        new york      NaN   
3   mission imaging services      arden  north carolina      NaN   
4       althoff lawn service  watertown    south dakota      NaN   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
1  construction           linkedin.com/company/wh-construction  11-50   
2  construction         linkedin.com/company/gregory-hitchcock  11-50   
3           NaN  linkedin.com/company/mission-imaging-services   1-10   
4  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  city_id  
0  bedroc

In [9]:
# Convert 'founded' column to integers, coercing errors to NaN
# This step will convert non-numeric values to NaN and leave existing NaN values as they are
merged_df['founded'] = pd.to_numeric(merged_df['founded'], errors='coerce').astype('Int64')

print(merged_df.head())

                        name       city           state  founded  \
0              bedrock black       lehi            utah     2023   
1  ward-henshaw construction      canby          oregon     1977   
2          gregory hitchcock   new york        new york     <NA>   
3   mission imaging services      arden  north carolina     <NA>   
4       althoff lawn service  watertown    south dakota     <NA>   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
1  construction           linkedin.com/company/wh-construction  11-50   
2  construction         linkedin.com/company/gregory-hitchcock  11-50   
3           NaN  linkedin.com/company/mission-imaging-services   1-10   
4  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  city_id  
0  bedrockblack.com    30836  
1  ward-henshaw.com    24894  
2               NaN    20587  
3               NaN    21607

In [10]:
# Reorder columns to make 'city_id' the first column
# Get a list of all columns, then filter out 'city_id'
cols = merged_df.columns.tolist()
cols = ['city_id'] + [col for col in cols if col != 'city_id']

# Reassign reordered columns back to the DataFrame
merged_df = merged_df[cols]

print(merged_df.head())

   city_id                       name       city           state  founded  \
0    30836              bedrock black       lehi            utah     2023   
1    24894  ward-henshaw construction      canby          oregon     1977   
2    20587          gregory hitchcock   new york        new york     <NA>   
3    21607   mission imaging services      arden  north carolina     <NA>   
4    28319       althoff lawn service  watertown    south dakota     <NA>   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
1  construction           linkedin.com/company/wh-construction  11-50   
2  construction         linkedin.com/company/gregory-hitchcock  11-50   
3           NaN  linkedin.com/company/mission-imaging-services   1-10   
4  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
1  ward-henshaw.com  
2               NaN  
3         

In [11]:
# Drop the city and state columns as they are no longer needed
merged_df.drop(columns=['city', 'state'], inplace=True)

print(merged_df.head())

   city_id                       name  founded      industry  \
0    30836              bedrock black     2023           NaN   
1    24894  ward-henshaw construction     1977  construction   
2    20587          gregory hitchcock     <NA>  construction   
3    21607   mission imaging services     <NA>           NaN   
4    28319       althoff lawn service     <NA>  construction   

                                    linkedin_url   size           website  
0             linkedin.com/company/bedrock-black   1-10  bedrockblack.com  
1           linkedin.com/company/wh-construction  11-50  ward-henshaw.com  
2         linkedin.com/company/gregory-hitchcock  11-50               NaN  
3  linkedin.com/company/mission-imaging-services   1-10               NaN  
4      linkedin.com/company/althoff-lawn-service  11-50               NaN  


In [12]:
# Save the updated DataFrame to a new CSV file, ready for database import
merged_df.to_csv('importable-companies.csv', index=False)

In [14]:
# Load the CSV file into a DataFrame
df = pd.read_csv('importable-companies.csv')

# Print the number of rows before removal
original_row_count = df.shape[0]
print("Original number of rows:", original_row_count)

# Remove rows where the 'name' column is null
df_cleaned = df.dropna(subset=['name'])

# Print the number of rows after removal
updated_row_count = df_cleaned.shape[0]
print("Number of rows after removing nulls in 'name':", updated_row_count)

df_cleaned.to_csv('importable-companies.csv', index=False)

Original number of rows: 5920475
Number of rows after removing nulls in 'name': 5920472
