In [2]:
import pandas as pd
import os

In [7]:
# Directory the kernel is currently running in; the input file is resolved against it
current_directory = os.getcwd()

# File name of the input CSV
csv_filename = "companies.csv"

# Full path of the input file: <working directory>/<file name>
dataset_path = os.path.join(current_directory, csv_filename)

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [8]:
# Columns that must be populated; a row missing any one of them is discarded
required_columns = ['locality', 'region', 'name']
df = df.dropna(subset=required_columns, how='any')

# Preview the first rows after filtering (original row labels are kept)
print(df.head())

         country  founded                        id      industry  \
0  united states   2023.0             bedrock-black           NaN   
2  united states   1977.0           wh-construction  construction   
4  united states      NaN         gregory-hitchcock  construction   
5  united states      NaN  mission-imaging-services           NaN   
7  united states      NaN      althoff-lawn-service  construction   

                                    linkedin_url   locality  \
0             linkedin.com/company/bedrock-black       lehi   
2           linkedin.com/company/wh-construction      canby   
4         linkedin.com/company/gregory-hitchcock   new york   
5  linkedin.com/company/mission-imaging-services      arden   
7      linkedin.com/company/althoff-lawn-service  watertown   

                        name          region   size           website  
0              bedrock black            utah   1-10  bedrockblack.com  
2  ward-henshaw construction          oregon  11-50  ward-hens

In [9]:
# Remove the 'id' and 'country' columns, then rename 'locality' -> 'city' and
# 'region' -> 'state'. Each step returns a new frame (nothing is mutated in
# place), so the whole transform is a single assignment to df.
df = (
    df
    .drop(columns=['id', 'country'])
    .rename(columns={'locality': 'city', 'region': 'state'})
)

# Preview the first rows with the reduced, renamed column set
print(df.head())

   founded      industry                                   linkedin_url  \
0   2023.0           NaN             linkedin.com/company/bedrock-black   
2   1977.0  construction           linkedin.com/company/wh-construction   
4      NaN  construction         linkedin.com/company/gregory-hitchcock   
5      NaN           NaN  linkedin.com/company/mission-imaging-services   
7      NaN  construction      linkedin.com/company/althoff-lawn-service   

        city                       name           state   size  \
0       lehi              bedrock black            utah   1-10   
2      canby  ward-henshaw construction          oregon  11-50   
4   new york          gregory hitchcock        new york  11-50   
5      arden   mission imaging services  north carolina   1-10   
7  watertown       althoff lawn service    south dakota  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [10]:
# Put 'name', 'city' and 'state' first; every other column follows in its
# existing relative order
leading_columns = ['name', 'city', 'state']
trailing_columns = [col for col in df.columns if col not in leading_columns]
columns_order = leading_columns + trailing_columns
df = df[columns_order]

# Preview the first rows in the new column order
print(df.head())

                        name       city           state  founded  \
0              bedrock black       lehi            utah   2023.0   
2  ward-henshaw construction      canby          oregon   1977.0   
4          gregory hitchcock   new york        new york      NaN   
5   mission imaging services      arden  north carolina      NaN   
7       althoff lawn service  watertown    south dakota      NaN   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
2  construction           linkedin.com/company/wh-construction  11-50   
4  construction         linkedin.com/company/gregory-hitchcock  11-50   
5           NaN  linkedin.com/company/mission-imaging-services   1-10   
7  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [11]:
# Title-case 'name', 'city' and 'state': the first letter of each word is
# upper-cased (e.g. 'ward-henshaw construction' -> 'Ward-Henshaw Construction')
for text_column in ('name', 'city', 'state'):
    df[text_column] = df[text_column].str.title()

# Preview the first rows with the re-cased values
print(df.head())

                        name       city           state  founded  \
0              Bedrock Black       Lehi            Utah   2023.0   
2  Ward-Henshaw Construction      Canby          Oregon   1977.0   
4          Gregory Hitchcock   New York        New York      NaN   
5   Mission Imaging Services      Arden  North Carolina      NaN   
7       Althoff Lawn Service  Watertown    South Dakota      NaN   

       industry                                   linkedin_url   size  \
0           NaN             linkedin.com/company/bedrock-black   1-10   
2  construction           linkedin.com/company/wh-construction  11-50   
4  construction         linkedin.com/company/gregory-hitchcock  11-50   
5           NaN  linkedin.com/company/mission-imaging-services   1-10   
7  construction      linkedin.com/company/althoff-lawn-service  11-50   

            website  
0  bedrockblack.com  
2  ward-henshaw.com  
4               NaN  
5               NaN  
7               NaN  


In [12]:
# File name of the output CSV, joined onto the working directory captured in
# current_directory to form the full destination path
output_csv_filename = "processed-companies.csv"
output_dataset_path = os.path.join(current_directory, output_csv_filename)

# Write the processed frame to disk; index=False leaves the row index out of the file
df.to_csv(path_or_buf=output_dataset_path, index=False)