In [456]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
import pingouin as pg

# Apply seaborn's default plot theme (called with no arguments, so all defaults are used)
sns.set()

This is a big block of code that loops over the files for each worm mutation, combines the data, and takes an average acceleration over 15 frames (~2 seconds). The intent is to summarise this as min, max, avg, and quartiles; note that the cell below currently computes only the mean across worms for each frame. The result is compiled into one big dataframe to be hypothesis tested on later.

In [407]:
# Build `mutants`: one column per mutation, holding the smoothed acceleration
# averaged across all retained worms, one row per rolling-window position.

# Per-mutation Series collected here, keyed by mutation name
mutant_stats = {}
# Folders to find the files in per mutation
folders = ['REDACTED']

# Locations and tunable thresholds for the cleaning / smoothing steps below
FRAME_DISTANCE_DIR = '../../data/frame_distances/'
ACCELERATION_DIR = '../../data/accelerations/'
MAX_VALID_VALUE = 10       # readings above this are treated as bad data and nulled
MAX_NULL_FRACTION = 0.05   # worms (columns) with a larger share of nulls are dropped
ROLLING_WINDOW = 15        # number of frames in the rolling average

for folder in folders:
    # Mutation name without surrounding slashes; used as the output file stem and column name
    name = folder.strip('/')
    # os.path.join works whether or not the folder string carries a trailing slash
    path = os.path.join(FRAME_DISTANCE_DIR, name)

    # os.listdir returns entries in arbitrary order and includes hidden entries
    # (e.g. .ipynb_checkpoints), so keep regular, non-hidden files only and sort
    # them to make the column order of the saved CSV reproducible.
    files = sorted(
        entry for entry in os.listdir(path)
        if not entry.startswith('.') and os.path.isfile(os.path.join(path, entry))
    )
    if not files:
        raise FileNotFoundError('No data files found in ' + path)

    # Read every file once and concatenate side by side in a single call
    # (repeated concat inside a loop re-copies the growing frame each time).
    df = pd.concat([pd.read_csv(os.path.join(path, entry)) for entry in files], axis=1)

    # Changing column names so not to have duplicates
    df.columns = range(df.shape[1])

    # Making values greater than the threshold null in case some cleaning was missed before
    df = df.mask(df > MAX_VALID_VALUE)

    # Removing worms with too many null values, this is necessary to retain accuracy
    pct_null = df.isnull().sum() / len(df)
    missing_features = pct_null[pct_null > MAX_NULL_FRACTION].index
    df = df.drop(columns=missing_features)

    # Filling the remaining nulls with the value directly above (forward fill).
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()

    # Rolling average over ROLLING_WINDOW frames; the first ROLLING_WINDOW - 1 rows
    # have an incomplete window (NaN) and are removed.
    rolling = df.rolling(ROLLING_WINDOW).mean().iloc[ROLLING_WINDOW - 1:]

    # Smoothed value divided by the window length.
    # NOTE(review): the original comment gives the units as pixels/frame^2 — confirm
    # that dividing the rolling mean by the window length is the intended definition.
    acceleration = rolling / ROLLING_WINDOW

    acceleration.to_csv(ACCELERATION_DIR + name + '.csv', index=False)

    # Mean across worms (columns) for each row, giving one Series per mutation.
    # Only the mean is computed here; no min, max, or quartiles.
    avg_acc = acceleration.mean(axis=1)

    mutant_stats[name] = avg_acc

# Making one big DataFrame with a column for each mutation; Series are aligned on
# their row index, so mutations with fewer rows get NaN in the missing positions.
mutants = pd.DataFrame(mutant_stats)

In [424]:
# Persist the combined per-mutation table without the row index.
# NOTE(review): the execution counts suggest this cell was last run after the
# dropna cell further down; on a top-to-bottom run it saves the frame before
# nulls are dropped — confirm which version is intended.
mutants_csv_path = '../../data/mutants.csv'
mutants.to_csv(mutants_csv_path, index=False)

In [415]:
# Drop, in place, every row that has a null in any mutation column, so all
# columns passed to the tests below have the same length.
mutants.dropna(axis=0, how='any', inplace=True)

ANOVA testing

In [416]:
# One-way ANOVA across the first five mutation columns, selected by position
stats.f_oneway(*(mutants.iloc[:, position] for position in range(5)))

F_onewayResult(statistic=665.2321835604525, pvalue=0.0)

T-Testing all mutant columns against the wild-type.

In [421]:
# Independent two-sample t-test (scipy defaults) of each mutant column against
# the wild-type column 'worm'.
# Note: this rebinds `mutant_stats`, replacing the dict of Series built in the
# aggregation cell with a dict of [statistic, p-value] pairs.
mutant_stats = {}
for mutation in ['REDACTED']:
    # Run the test once and unpack both values, instead of computing it twice
    t_stat, p_value = stats.ttest_ind(mutants['worm'], mutants[mutation])
    mutant_stats[mutation] = [t_stat, p_value]

# NOTE(review): the first row holds the t-statistic; the 'f-statistic' label is
# kept unchanged so any later lookup by that label keeps working — consider renaming.
stat_table = pd.DataFrame(mutant_stats, index = ['f-statistic', 'p-value'])

This shows me that there are significant differences between the wild-type and the mutations. 