In [1]:
import pandas as pd 
from tabulate import tabulate
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../Data/cw2-dataset/readings.csv");

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136824 entries, 0 to 136823
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           136824 non-null  int64  
 1   value        136824 non-null  float64
 2   location     136824 non-null  object 
 3   sample date  136824 non-null  object 
 4   measure      136824 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 5.2+ MB


In [4]:
locations = df['location'].unique()
header = ['id' , 'location', 'sample date']
header.extend(df['measure'].unique())
dataframe = pd.DataFrame(columns=header)

In [5]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Columns: 109 entries, id to Total dissolved phosphorus
dtypes: object(109)
memory usage: 0.0+ bytes


In [6]:
def d_parse(date):
    months = {
        "Jan" : 1,
        "Feb" : 2,
        "Mar" : 3,
        "Apr" : 4,
        "May" : 5,
        "Jun" : 6,
        "Jul" : 7,
        "Aug" : 8,
        "Sep" : 9,
        "Oct" : 10,
        "Nov" : 11,
        "Dec" : 12 
    }
    d = date.split("-")
    day = int(d[0])
    month = months[d[1]]
    year = int(d[2])
    if year <= 20:
        if year < 10:
            year = str(200) + str(year)
        else:
            year = str(20) + str(year)
    else: 
        year = str(19) + str(year)
    
    dt = tuple((day , month, int(year)))
    return dt

In [7]:
d_parse("07-Nov-05")

(7, 11, 2005)

In [8]:
df = df.groupby(['location', 'sample date', 'measure'], as_index=False).mean()
for location in locations:
    tmp_df = df[df['location'] == location]
    dates = tmp_df['sample date'].unique()
    for date in dates: 
        tmp = tmp_df[tmp_df['sample date'] == date]
        measures = tmp['measure'].tolist()
        vals = tmp['value'].tolist()
        headers = ['location','sample date', 'day' , 'month', 'year']
        headers.extend(measures)
        dt = d_parse(date)
        values = [location , date, dt[0], dt[1], dt[2]]
        values.extend(vals)
        dataframe = dataframe.append({headers[i]: values[i] for i in range(len(headers))}, ignore_index=True).fillna(np.nan)  

In [9]:
dataframe.head()

Unnamed: 0,id,location,sample date,Water temperature,Dissolved oxygen,Ammonium,Nitrites,Nitrates,Orthophosphate-phosphorus,Total phosphorus,...,Inorganic nitrogen,Berilium,Boron,AGOC-3A,Methylosmoline,Chlorodinine,Total dissolved phosphorus,day,month,year
0,,Boonsri,07-Nov-15,,,,,,,,...,,,,0.598,1.783,0.258667,,7.0,11.0,2015.0
1,,Boonsri,09-Sep-15,,,,,,,,...,,,,0.31,0.314333,0.247,,9.0,9.0,2015.0
2,,Boonsri,1-Aug-14,24.733333,6.416667,0.071667,0.041,0.845667,0.107,0.178667,...,,,,,,,,1.0,8.0,2014.0
3,,Boonsri,1-Aug-15,27.666667,,0.056333,0.023,0.936667,0.044667,0.059,...,,,,,,,0.053,1.0,8.0,2015.0
4,,Boonsri,1-Nov-08,13.0,9.123333,0.168,0.025667,1.129667,0.017333,0.066667,...,,,,,,,,1.0,11.0,2008.0


In [10]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3068 entries, 0 to 3067
Columns: 112 entries, id to year
dtypes: float64(110), object(2)
memory usage: 2.6+ MB


In [11]:
dataframe.to_csv("../Data/data.csv", index=False)

In [12]:
_40Percent = (40 * len(dataframe)) / 100.0
df = pd.DataFrame()

In [13]:
for (columnName, columnData) in dataframe.iteritems():
    if(columnData.dtype != object):
        if(_40Percent >= columnData.isna().sum()):
            df[columnName] = columnData
    else:
        df[columnName] = columnData

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3068 entries, 0 to 3067
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   location                     3068 non-null   object 
 1   sample date                  3068 non-null   object 
 2   Water temperature            2692 non-null   float64
 3   Dissolved oxygen             2268 non-null   float64
 4   Ammonium                     2439 non-null   float64
 5   Nitrites                     2441 non-null   float64
 6   Nitrates                     2439 non-null   float64
 7   Orthophosphate-phosphorus    2437 non-null   float64
 8   Total phosphorus             2359 non-null   float64
 9   Calcium                      1861 non-null   float64
 10  Magnesium                    1872 non-null   float64
 11  Chlorides                    1942 non-null   float64
 12  Biochemical Oxygen           2252 non-null   float64
 13  Chemical Oxygen De

In [15]:
def detect_anomilies(df):
    for (columnName, columnData) in df.iteritems():
        mins = []
        maxs = []
        cnts = []
        colN = []
        mens = []
        if(columnData.dtype != object):
            # calculate summary statistics
            data_mean, data_std = np.mean(columnData), np.std(columnData)
            # identify outliers
            cut_off = data_std * 2
            lower, upper = data_mean - cut_off, data_mean + cut_off
            outliers = [x for x in columnData if x < lower or x > upper]
            if columnName == "Water temperature": 
                print(outliers)
            if len(outliers) == 0: 
                mins.append(0)
                maxs.append(0)
            else:
                mins.append(np.min(outliers))
                maxs.append(np.max(outliers))
            cnts.append(len(outliers))
            colN.append(columnName)
            mens.append(data_mean)
                
            headers = ['Col', 'Cnt', 'Min', 'Max', 'Mean']
            title = ""
            tdata = [colN,cnts,mins, maxs, mens]
            printTable(title, tdata, headers)
        

In [16]:
detect_anomilies(df)

[32.0, 36.4, 34.0]


NameError: name 'printTable' is not defined

In [18]:
fdf = pd.DataFrame()
for location in locations:
    loc_df = df[df['location'] == location]
    loc_df = loc_df.fillna(loc_df.mean())
    fdf = pd.concat([loc_df, fdf], ignore_index=True)

In [19]:
fdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3068 entries, 0 to 3067
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   location                     3068 non-null   object 
 1   sample date                  3068 non-null   object 
 2   Water temperature            3068 non-null   float64
 3   Dissolved oxygen             3068 non-null   float64
 4   Ammonium                     3068 non-null   float64
 5   Nitrites                     3068 non-null   float64
 6   Nitrates                     3068 non-null   float64
 7   Orthophosphate-phosphorus    3068 non-null   float64
 8   Total phosphorus             3068 non-null   float64
 9   Calcium                      3068 non-null   float64
 10  Magnesium                    3068 non-null   float64
 11  Chlorides                    3068 non-null   float64
 12  Biochemical Oxygen           3068 non-null   float64
 13  Chemical Oxygen De

In [None]:
fdf.to_csv("../Data/fdata.csv", index=False)

In [None]:
for location in locations:
    loc_df = fdf[fdf['location'] == location]
    print("Report:" , location)
    detect_anomilies(loc_df)
    print('-----------------------------')

In [20]:
fdf.describe()

Unnamed: 0,Water temperature,Dissolved oxygen,Ammonium,Nitrites,Nitrates,Orthophosphate-phosphorus,Total phosphorus,Calcium,Magnesium,Chlorides,Biochemical Oxygen,Chemical Oxygen Demand (Cr),Chemical Oxygen Demand (Mn),day,month,year
count,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0
mean,13.971903,8.928354,0.525757,0.040263,1.591492,0.063216,0.12602,56.299665,19.413736,48.434611,2.980884,23.963132,5.011238,17.313233,6.645046,2009.841591
std,7.693919,1.859258,0.998174,0.061462,0.94993,0.076063,0.133036,13.341634,7.994238,42.01627,2.770242,16.524465,2.325034,7.778813,3.405746,5.51469
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2,0.0,0.0,0.0,0.0,1.0,1.0,1998.0
25%,7.0,7.7825,0.120333,0.02,1.089167,0.027,0.07,50.5,14.528333,27.725,2.0,14.7,3.83,11.0,4.0,2005.0
50%,14.0,8.974518,0.2581,0.028871,1.573333,0.042201,0.099228,53.378694,20.066489,32.96,2.395,23.447537,4.2,18.0,7.0,2011.0
75%,20.5,10.009917,0.403417,0.042,1.844657,0.073,0.13675,61.150187,21.9,58.596233,3.6,29.0275,6.08,24.0,10.0,2015.0
max,36.4,15.96,14.869,2.052,17.3,1.75,3.533333,153.2,77.72,468.3,74.71,431.09,30.8,31.0,12.0,2016.0


In [17]:
def printTable(title, tdata, headers=[]):
    print('\n')
    print(title)    
    print('\n')
    print(tabulate(tdata, headers= headers, tablefmt="pretty"))
    print('\n')

In [22]:
import seaborn as sns
plt.figure(figsize=(16,12))
plt.suptitle('Data ValuesDistribution', fontsize=22)
plt.subplot(221)
g = sns.distplot(fdf[fdf['Dissolved oxygen'] <= 10.009917]['Dissolved oxygen'])
g.set_title("Transaction AmountDistribuition<=1000", fontsize=18)
g.set_xlabel("")
g.set_ylabel("Probability", fontsize=15)
plt.subplot(222)
g1 = sns.distplot(np.log(fdf['Dissolved oxygen']))
g1.set_title("Transaction Amount(Log)Distribuition", fontsize=18)
g1.set_xlabel("")
g1.set_ylabel("Probability", fontsize=15)
plt.show()

ModuleNotFoundError: No module named 'seaborn'