In [None]:
import pandas as pd
import numpy as np

# Importing data

In [None]:
# File name of the survey export; being a relative path, it is resolved
# against the current working directory.
survey_csv = 'January 3-10, 2018 - Core Trends Survey - CSV.csv'
# Load the whole file into a DataFrame.
social = pd.read_csv(survey_csv)

In [None]:
# A bare expression on the last line of a cell is shown as that cell's output.
social

In [None]:
social.head() # Show the first rows of the DataFrame (5 when no count is given).

In [None]:
social.tail() # Show the last rows of the DataFrame (5 when no count is given).

In [None]:
# Summary statistics per column; with mixed column types, pandas summarizes
# only the numeric columns by default.
social.describe()

In [None]:
# Save the DataFrame to disk. index=False leaves out the row index, which
# would otherwise be written as an extra unnamed first column, so the saved
# file would no longer have the same columns as the DataFrame.
social.to_csv('copy_of_core_trends_survey.csv', index=False)

# Series
## *DataFrames are made of Series, which are made of single values*
- Each DataFrame is like a *dictionary* of Series of the **same length**.
- Each Series is like a *list* of single values of the **same data type**.

In [None]:
# A Series can be built from a plain Python list...
values_list = [1, 2, 3]
s = pd.Series(values_list)
print(s)

# ...from a NumPy array, optionally labelling each element through `index`...
values_array = np.array([1, 2, 3])
labels = ['first', 'second', 'third']
s = pd.Series(values_array, index=labels)
print(s)

# ...or from a dictionary, whose keys become the index labels.
values_by_label = dict(first=1, second=2, third=3)
s = pd.Series(values_by_label)
print(s)

In [None]:
# Series of different lengths can be combined into one DataFrame; positions
# that have no value are filled with missing values.

first_only = s.iloc[:1]  # Only the first element of s (selected by position).

# From a list: each Series becomes a row.
df = pd.DataFrame([s, first_only])
print(df)

# From a dictionary: each Series becomes a column named after its key.
df = pd.DataFrame({'one': s, 'two': first_only})
print(df)

# Selecting data
## Columns
`df.col` accepts:
- A single column name, as an attribute. Doesn't work with special characters. Can interfere with existing attributes/methods.

`df[col]` accepts:
- A single string. Returns a Series.
- A list of strings of any length (including 1). Returns a DataFrame.

## Rows (and columns)
`df[row]` accepts:
- A boolean array. Returns a DataFrame.

`df.loc[row, col]` accepts:
- A single integer or string.
- A list of integers or strings of any length (including 1).
- A list of booleans of the same length as the dimension we want to select.

`df.loc[row]` will work with **integers or strings** *depending* on whether the index is made of integers or strings, respectively. Match by **name**.

`df.iloc[row]` will *always* work with **integers**, no matter what's in the index. Match by **position**.

`.loc` and `.iloc` also accept boolean arrays.

In [None]:
# `selection` stands for any selector accepted by df[...]. It is bound to an
# existing column name here so that the cell runs on a fresh kernel instead
# of raising NameError.
selection = 'one'

# We can get.
var = df[selection]

# And set.
df[selection] = var

# Combining DataFrames

In [None]:
# pd.concat joins DataFrames along either axis; here three copies of df are
# placed side by side (column-wise).
copies = [df] * 3
new_df = pd.concat(copies, axis='columns')
new_df

In [None]:
# Stack three copies of new_df on top of each other (row-wise concatenation).
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
# and, like append did, returns a new DataFrame instead of modifying in place
# (unlike list.append()).
new_df = pd.concat([new_df, new_df, new_df])
new_df

In [None]:
# Stacking copies of new_df in the previous cell repeats the same index
# labels; reset_index replaces them with a fresh 0..n-1 index, and drop=True
# discards the old labels instead of keeping them as a column.
new_df = new_df.reset_index(drop=True)
new_df

In [None]:
# Fixed seeds make the two random subsets, and everything derived from them
# in later cells, identical on every run.
SEED = 0

# Two random subsets of 1000 rows, each keeping only the id column and one
# other variable. Different seeds keep the two subsets from being the same
# rows.
sub_1 = social.sample(1000, random_state=SEED)[['respid', 'sex']]
sub_2 = social.sample(1000, random_state=SEED + 1)[['respid', 'age']]

# Join on the shared id column. merge defaults to an inner join, so only ids
# present in both subsets are kept.
merged = sub_1.merge(sub_2, on='respid')
print(merged)

# Reshaping DataFrames

In [None]:
# Melting stacks the two value columns, so the long table should have twice
# as many rows as merged.
n_rows = merged.shape[0]
print(merged.shape, n_rows * 2)

# Melt: wide -> long, one row per (respid, variable) pair.
value_columns = ['sex', 'age']
melted = merged.melt(id_vars='respid', value_vars=value_columns)
print(melted)

# Pivot: long -> wide again, then turn respid back into a regular column and
# clear the leftover name of the columns axis.
pivoted = (
    melted
    .pivot(index='respid', columns='variable', values='value')
    .reset_index()
    .rename_axis(None, axis='columns')
)
print(pivoted)

In [None]:
# Look up an id taken from the data itself rather than a hard-coded one, so
# the example finds rows whichever ids ended up in the random sample.
example_id = melted['respid'].iloc[0]
melted[melted['respid'] == example_id] # Each id is there twice: once for each variable melted.

# Grouping and aggregating data

In [None]:
# Split the data into one subset per distinct value of 'sex'.
# Iterating over the GroupBy object yields (group name, subset) pairs.
grouped = merged.groupby('sex')
for key, rows in grouped:
    print(key, rows, sep='\n')

In [None]:
# Grouping by several columns: each group name is then a (sex, age) tuple,
# paired with the rows that share that combination.
by_columns = ['sex', 'age']
grouped = merged.groupby(by_columns)
for pair in grouped:
    group_key, group_rows = pair
    print(group_key)
    print(group_rows)

In [None]:
# Fetch a single group by its name; with two grouping columns the name is a
# (sex, age) tuple. The key is taken from the groups that actually exist,
# because a hard-coded pair such as (1, 18) raises KeyError whenever the
# random sample contains no row with that combination.
example_key = next(iter(grouped.groups))
grouped.get_group(example_key)

In [None]:
def pct_above_base(values):
    """Return (mean of values - 1) * 100.

    Presumably 'sex' is coded 1/2, which would make this the percentage of
    rows coded 2 -- TODO confirm against the survey codebook.
    """
    return (np.mean(values) - 1) * 100


grouped = merged.groupby('age')
sex_by_age = grouped['sex']

# Aggregate with a dedicated method: one value per age group.
print((sex_by_age.mean() - 1) * 100)

# The same computation through the generic aggregate method and a function.
print(sex_by_age.aggregate(pct_above_base))

# transform applies the function per group too, but returns a Series aligned
# with the rows of merged: every row receives the value of its own group.
pct = sex_by_age.transform(pct_above_base)
print(pct)

In [None]:
# assign returns a new DataFrame with the extra pct_sex column and leaves
# merged untouched, whereas merged['pct_sex'] = pct would modify merged in place.
merged.assign(pct_sex=pct)