# Imports
There are two major modules used to tidy the dataset:
1. openpyxl
2. pandas

### openpyxl

To manipulate XLSX files effectively, the *openpyxl* Python module is used to extract the relevant columns from the multiple sheets in the SurveyMonthly.xlsx file.

In [1]:
import openpyxl as opxl

### pandas
As a powerful tool learned in DATA 301, the *pandas* Python module is used to organize columns, ensure unique indices, and merge and join tables.

In [2]:
import pandas as pd

---
# Procedure

### Relevant Information

*Files & Columns*

In [3]:
# Directory in which the raw data files are found.
loadFileDir = '/data/chazalex/'

# CSV file name -> the columns to read from that file.
CSVfileNames = {
    'Battery.csv': ['participantID', 'date', 'level', 'plugged'],
}

# XLSX file name -> positions of the worksheets to load from that workbook.
XLSXfileNames = {
    'SurveyMonthly.xlsx': [1, 2, 3, 7, 10],
}

# Position of a loaded worksheet (0 = first sheet loaded, not the workbook
# sheet index) -> substrings that identify the columns to keep from it.
sheetColumns = {
    0: ['Days felt', 'Student confidence', 'Non-student confidence'],
    1: ['I see myself as someone who'],
    2: ['Personal outlook', 'More Agree/Disagree', 'In the past week'],
    3: ['More Agree/Disagree', 'Describe self', 'Variety feelings/behaviors'],
    4: ['Fate Opinion'],
}

# Directory to which the transformed files are saved.
saveFileDir = '/data/chazalex/transforms/'

### Functions

In [4]:
def initDataframes(loadFileDir, CSVdict, XLSXdict, sheetCol):
    """Load every source file into a DataFrame and split 'date' into parts.

    Parameters
    ----------
    loadFileDir : str
        Directory prefix that is concatenated with each file name.
    CSVdict : dict
        CSV file name -> list of columns to read.
    XLSXdict : dict
        XLSX file name -> list of worksheet positions to load.
    sheetCol : dict
        Position in the list of loaded worksheets -> substrings that select
        the columns to keep.

    Returns
    -------
    list of pandas.DataFrame
        The worksheet frames followed by the CSV frames.  In each frame the
        'date' column is replaced by 'year', 'month', 'day' and 'hour'.
    """
    # Use the arguments that were passed in rather than the notebook globals,
    # so the function honours whatever dictionaries the caller supplies.
    rawDFlist = XLSXtoDF(loadFileDir, XLSXdict)
    xlDFlist = extractColumnsFromRawDFlist(rawDFlist, sheetCol)
    dfList = CSVtoDF(xlDFlist, loadFileDir, CSVdict)
    for df in dfList:
        # Parse the date column once and reuse it for every component.
        dates = pd.DatetimeIndex(df['date'])
        df['year'] = dates.year
        df['month'] = dates.month
        df['day'] = dates.day
        df['hour'] = dates.hour
        df.drop('date', axis=1, inplace=True)
    return dfList

*XLSX*

In [5]:
def XLSXtoDF(loadFileDir, XLSXdict):
    """Convert the requested worksheets of every workbook into DataFrames.

    Parameters
    ----------
    loadFileDir : str
        Directory prefix that is concatenated with each file name.
    XLSXdict : dict
        XLSX file name -> list of worksheet positions to load.

    Returns
    -------
    list of pandas.DataFrame
        One frame per requested worksheet, in the iteration order of
        ``XLSXdict`` and, within a workbook, in the order of its positions.
        An empty ``XLSXdict`` yields an empty list.
    """
    # Accumulate across workbooks; the list must exist before the loop so
    # that no workbook's sheets are discarded and an empty dict still works.
    dfList = []
    for fileName, sheetIdx in XLSXdict.items():
        sheets = extractSheets(loadFileDir, fileName, sheetIdx)
        dfList.extend(sheetToDF(sheet) for sheet in sheets)
    return dfList

In [6]:
def extractSheets(loadFileDir, fileName, sheetIdx):
    """Open a workbook and return the worksheets at the given positions.

    Parameters
    ----------
    loadFileDir : str
        Directory prefix that is concatenated with ``fileName``.
    fileName : str
        Name of the XLSX file to open.
    sheetIdx : iterable of int
        Positions, within the workbook's list of sheet names, of the sheets
        to return.

    Returns
    -------
    list
        The selected sheets, in the order given by ``sheetIdx``.
    """
    workbook = opxl.load_workbook(loadFileDir + fileName)
    # The ``sheetnames`` property replaces the deprecated get_sheet_names().
    sheetNames = workbook.sheetnames
    return [workbook[sheetNames[i]] for i in sheetIdx]

In [7]:
def sheetToDF(sheet):
    """Build a DataFrame from a worksheet, using its first row as the header.

    ``sheet.values`` is consumed as an iterator: the first item supplies the
    column labels and every remaining item becomes one data row.
    """
    rows = sheet.values
    header = next(rows)
    return pd.DataFrame(list(rows), columns=header)

In [8]:
def extractColumnsFromRawDFlist(rawDFlist, sheetCol):
    """Keep only the identifying and requested columns of each raw frame.

    Parameters
    ----------
    rawDFlist : list of pandas.DataFrame
        Raw worksheet frames, addressed by position.
    sheetCol : dict
        Position in ``rawDFlist`` -> list of substrings; a column is kept
        when its name contains any of them.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``sheetCol``, in its iteration order, holding
        'participantID', 'date' (renamed from 'StartDate') and the matched
        columns.  Index labels are converted to strings.
    """
    dfList = []
    # Iterate the argument, not the notebook-level global, so the function
    # works with whatever mapping the caller passes.
    for key, substrings in sheetCol.items():
        rawDF = rawDFlist[key]
        colList = ['participantID', 'StartDate']
        for substring in substrings:
            # Non-string headers (e.g. blank header cells) cannot contain a
            # substring, so they are skipped instead of raising TypeError.
            colList.extend(col for col in rawDF.columns
                           if isinstance(col, str) and substring in col)
        selected = pd.DataFrame(rawDF, columns=colList)
        # index=str is kept from the original so index labels stay strings.
        # Renaming before appending removes the old requirement that the
        # dictionary keys match the positions in dfList.
        dfList.append(selected.rename(index=str, columns={'StartDate': 'date'}))
    return dfList

*CSV*

In [9]:
def CSVtoDF(dfList, loadFileDir, CSVdict):
    """Append one DataFrame per CSV file to ``dfList`` and return that list.

    Each key of ``CSVdict`` is a file name, read from ``loadFileDir``; its
    value lists the columns to load.  ``dfList`` is extended in place.
    """
    for csvName, wantedCols in CSVdict.items():
        csvPath = loadFileDir + csvName
        dfList.append(pd.read_csv(csvPath, usecols=wantedCols))
    return dfList

*Transform*

*Display*

In [10]:
def displayDFinfo(DFdict):
    """Print a banner and the ``DataFrame.info()`` summary for each entry.

    Parameters
    ----------
    DFdict : dict
        File name -> DataFrame.  Entries are reported in iteration order.
    """
    # Iterate the argument, not the notebook-level saveFileDict, so the
    # function reports on whatever mapping the caller passes.
    for fileName, df in DFdict.items():
        print('====================================================')
        print(fileName)
        print('====================================================')
        # info() writes its report itself and returns None; wrapping it in
        # print() only added a stray "None" line after each summary.
        df.info()

### Steps

*Initialize Dataframes*

In [11]:
# Load all source files into DataFrames using the configuration defined above.
dfList = initDataframes(loadFileDir, CSVfileNames, XLSXfileNames, sheetColumns)

  warn(msg)


*Transform Dataframes*

Initialized

In [12]:
# Frames 0-2 are joined side by side; only the last of them keeps its date
# parts, so the combined frame carries a single year/month/day/hour set.
dateParts = ['year', 'month', 'day', 'hour']
spPerceptFrames = [
    dfList[0].drop(dateParts, axis=1),
    dfList[1].drop(dateParts, axis=1),
    dfList[2],
]
spPercept = pd.concat(spPerceptFrames, axis=1)

# Frames 3 and 4 are joined side by side without dropping any columns.
faSpPercept = pd.concat([dfList[3], dfList[4]], axis=1)

# The last frame in the list is taken as the battery data.
battery = dfList[-1]

Averaged

In [13]:
# Mean 'level' and 'plugged' per participant.  Several columns must be
# selected from a groupby with a list ([[...]]); the bare 'a','b' form is
# deprecated and rejected by newer pandas.
batteryPartIDAvg = battery.groupby('participantID', as_index=False)[['level', 'plugged']].mean()

# Mean of every numeric column per calendar day.  numeric_only=True keeps
# the older behaviour of leaving non-numeric columns out instead of raising.
batteryDateAvg = battery.groupby(['year', 'month', 'day'], as_index=False).mean(numeric_only=True)

Merged

In [14]:
# Join the daily battery averages (minus 'hour') to the survey frame on the
# calendar date, using merge's default join type.
batMergeSpDate = batteryDateAvg.drop('hour', axis=1).merge(
    spPercept, on=['year', 'month', 'day'])

*Save Dataframes*

In [15]:
# Output file name -> DataFrame to be written under that name.
saveFileDict = {
    'spPerceptions.csv': spPercept,
    'faSpPerceptions.csv': faSpPercept,
    'battery.csv': battery,
    'batteryPartIDAvg.csv': batteryPartIDAvg,
    'batteryDateAvg.csv': batteryDateAvg,
    'batMergeSpDate.csv': batMergeSpDate,
}

In [16]:
# Write every DataFrame to the configured output directory under its
# dictionary key, without the index column.  saveFileDir replaces the
# hard-coded copy of the same path so the location is defined in one place.
for fileName, df in saveFileDict.items():
    df.to_csv(saveFileDir + fileName, index=False)

---
# Summary

In [17]:
# Print an info() summary for each DataFrame that was saved.
displayDFinfo(saveFileDict)

spPerceptions.csv
<class 'pandas.core.frame.DataFrame'>
Index: 57 entries, 0 to 9
Columns: 130 entries, participantID to hour
dtypes: float64(9), object(121)
memory usage: 58.3+ KB
None
faSpPerceptions.csv
<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 99
Data columns (total 54 columns):
participantID                                                                                                                           120 non-null object
More Agree/Disagree In most ways my life is close to my ideal.                                                                          120 non-null object
More Agree/Disagree The conditions in my life are excellent.                                                                            120 non-null object
More Agree/Disagree I am satisfied with my life.                                                                                        119 non-null object
More Agree/Disagree So far I have gotten the important things I want in

---
<!--NAVIGATION-->
< [Raw Dataset](https://data301.calpolydatascience.org/user/chazalex/notebooks/finalproj/notebook/02-Raw.ipynb) | [Data Analysis](https://data301.calpolydatascience.org/user/chazalex/notebooks/finalproj/notebook/04-EDA.ipynb) >