<a href="https://colab.research.google.com/github/ajoneshs/data_demo/blob/main/cobbs_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths
# used by the later cells (under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the original (uncleaned) CSV from the mounted Drive folder and
# preview the first rows to check the column headers and placeholder values.
raw_csv_path = '/content/drive/MyDrive/Colab Notebooks/colab_data/og_cobbs_data.csv'
df = pd.read_csv(raw_csv_path)
print(df.head())

        Date     Time Sample ID Rain in Last 24HR Water Level Water Color  \
0  7/18/2023  8:38 AM    CC-1.1                no           -           -   
1  7/18/2023  8:39 AM    CC-1.2                no           -           -   
2  7/18/2023  8:40 AM    CC-1.3                no           -           -   
3  7/19/2023  8:14 AM    CC-1.1               yes           -           -   
4  7/19/2023  8:15 AM    CC-1.2               yes           -           -   

   Water Temp (C)  DO (mg/L)  Pressure (mmHg)  Conductivity (uS/cm)    pH  \
0            24.8       6.78            760.8                 464.0  7.11   
1            24.7       6.61            760.8                 467.0  7.09   
2            24.7       6.57            760.8                 467.0  7.07   
3            24.0       4.59            761.8                 490.0  6.72   
4            24.1       4.56            761.8                 487.0  6.90   

   NO3-N (mg/L) PO4-P (mg/L) Presence/Absence  \
0           1.0         0

In [4]:
# Map each verbose header from the raw CSV to a short lowercase name so the
# columns are easier to reference in later cells.
column_aliases = {
    'Date': 'date',
    'Time': 'time',
    'Sample ID': 'id',
    'Rain in Last 24HR': 'rain24',
    'Water Level': 'water_level',
    'Water Color': 'water_color',
    'Water Temp (C)': 'water_temp',
    'DO (mg/L)': 'dox',
    'Pressure (mmHg)': 'pressure',
    'Conductivity (uS/cm)': 'conductivity',
    'pH': 'ph',
    'NO3-N (mg/L)': 'no3n',
    'PO4-P (mg/L)': 'po4p',
    'Presence/Absence': 'pres',
    'RiverWays Colony Count Average (colonies/100mL)': 'col_count',
}
df = df.rename(columns=column_aliases)

print(df.head())

        date     time      id rain24 water_level water_color  water_temp  \
0  7/18/2023  8:38 AM  CC-1.1     no           -           -        24.8   
1  7/18/2023  8:39 AM  CC-1.2     no           -           -        24.7   
2  7/18/2023  8:40 AM  CC-1.3     no           -           -        24.7   
3  7/19/2023  8:14 AM  CC-1.1    yes           -           -        24.0   
4  7/19/2023  8:15 AM  CC-1.2    yes           -           -        24.1   

    dox  pressure  conductivity    ph  no3n  po4p     pres col_count  
0  6.78     760.8         464.0  7.11   1.0  0.22  no data         -  
1  6.61     760.8         467.0  7.09   1.0  0.31  no data         -  
2  6.57     760.8         467.0  7.07   1.0  0.19  no data         -  
3  4.59     761.8         490.0  6.72   1.2  0.81  no data         -  
4  4.56     761.8         487.0  6.90   1.2  0.57  no data         -  


In [5]:
# Encode the literal answers 'no' / 'yes' as 0 / 1.
# The replacement is applied to every column of the frame; only cells whose
# whole value is exactly 'no' or 'yes' are touched (e.g. 'no data' is left alone).
yes_no_codes = {'no': 0, 'yes': 1}
df = df.replace(yes_no_codes)
print(df.head())

        date     time      id  rain24 water_level water_color  water_temp  \
0  7/18/2023  8:38 AM  CC-1.1     0.0           -           -        24.8   
1  7/18/2023  8:39 AM  CC-1.2     0.0           -           -        24.7   
2  7/18/2023  8:40 AM  CC-1.3     0.0           -           -        24.7   
3  7/19/2023  8:14 AM  CC-1.1     1.0           -           -        24.0   
4  7/19/2023  8:15 AM  CC-1.2     1.0           -           -        24.1   

    dox  pressure  conductivity    ph  no3n  po4p     pres col_count  
0  6.78     760.8         464.0  7.11   1.0  0.22  no data         -  
1  6.61     760.8         467.0  7.09   1.0  0.31  no data         -  
2  6.57     760.8         467.0  7.07   1.0  0.19  no data         -  
3  4.59     761.8         490.0  6.72   1.2  0.81  no data         -  
4  4.56     761.8         487.0  6.90   1.2  0.57  no data         -  


In [6]:
# Turn the placeholder strings used for missing readings into real NaN values
# so pandas treats them as missing data.
missing_markers = ['-', 'no data']
df = df.replace(missing_markers, np.nan)
print(df)

          date     time      id  rain24     water_level water_color  \
0    7/18/2023  8:38 AM  CC-1.1     0.0             NaN         NaN   
1    7/18/2023  8:39 AM  CC-1.2     0.0             NaN         NaN   
2    7/18/2023  8:40 AM  CC-1.3     0.0             NaN         NaN   
3    7/19/2023  8:14 AM  CC-1.1     1.0             NaN         NaN   
4    7/19/2023  8:15 AM  CC-1.2     1.0             NaN         NaN   
..         ...      ...     ...     ...             ...         ...   
139        NaN      NaN  CC-1.2     NaN  4: Normal-high         NaN   
140        NaN      NaN  CC-1.3     NaN  4: Normal-high         NaN   
141        NaN      NaN  CC-1.1     NaN  4: Normal-high         NaN   
142        NaN      NaN  CC-1.2     NaN  4: Normal-high         NaN   
143        NaN      NaN  CC-1.3     NaN  4: Normal-high         NaN   

     water_temp   dox  pressure  conductivity    ph  no3n  po4p pres col_count  
0          24.8  6.78     760.8         464.0  7.11   1.0  0.22  N

In [7]:
# Persist the cleaned frame next to the original CSV on Drive.
# index=False: the frame only carries the default positional index (0..N),
# which holds no information; writing it would add an unnamed first column
# that comes back as 'Unnamed: 0' when the file is read again.
df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/colab_data/clean_cobbs_data.csv',
    index=False,
)