# Imports

In [1]:
from itertools import islice

import numpy as np
import pandas as pd

# 1 - Reading and Exploring Data

## Read Data
- header=0: the row index where the column names are
- sep=',': the column separator
- encoding='utf-8': specify the encoding of the read csv file 

In [2]:
dataFrame = pd.read_csv('all.csv', header=0)

FileNotFoundError: [Errno 2] File b'all.csv' does not exist: b'all.csv'

## Check data

In [None]:
dataFrame.head(6) # no argument ==> first 5 rows (default)

In [None]:
dataFrame.tail() # default is last 5 rows


## Data summary
Summary statistics for the numeric columns only

In [None]:
dataFrame.describe()


## DataFrame Properties


In [None]:
dataFrame.columns

In [None]:
dataFrame.columns[0]

In [None]:
dataFrame.index

In [None]:
dataFrame.index[100]

In [None]:
dataFrame.shape # (number of rows, number of columns)

In [None]:
dataFrame.info()

In [None]:
dataFrame.dtypes

## Accessing Data

In [None]:
dataFrame.username[:5] # column `username` as a Series

In [None]:
dataFrame['username'][:5] # another syntax

In [None]:
dataFrame.username.values[:5] # column `username` as a NumPy array (first 5 values)

# 2 - Column Operations 

In [None]:
dataFrame.tags

## Str Operations

In [None]:
dataFrame.tags.str.split(';')

In [None]:
dataFrame.username.str.lower()

## Functions
These are just some examples. There are way more functions that can be used.

### Mean()

In [None]:
dataFrame.sentiment.mean()

### Unique()

In [None]:
dataFrame.username.unique()

In [None]:
dataFrame.username.nunique() # unique count

### Apply()

In [None]:
dataFrame.sentiment.apply(lambda s: 'Positive' if s > 0 else 'Neutral' if s == 0 else 'Negative')

# 3 - Slicing and Subsetting

## Series

In [None]:
usernames = dataFrame['username']
type(usernames)

In [None]:
usernames[0]

In [None]:
usernames[-1] # doesn't work with negative indexing

## DataFrame

In [None]:
usernames = dataFrame[['username']] # double brackets (a list of column names) to get a DataFrame
type(usernames)

In [None]:
usernames.head()

In [None]:
usernames[0] # doesn't work with normal indexing

## Using location for DataFrame slicing & indexing
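This type of slicing/indexing is label-based: it uses row index labels and column names. With the default integer index the row labels are 0, 1, 2, ..., which is why negative values are not accepted.  
\* Note that it also accepts named indexes.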

In [None]:
dataFrame.loc[0] # row 0

In [None]:
dataFrame.loc[[0, 1, 100]] # a list of the row index labels that you want

In [None]:
dataFrame.loc[-1] # doesn't work with negative indexes

In [None]:
dataFrame.loc[:5, ['username', 'date']] # provide column names to get their respective data

In [None]:
dataFrame.loc[:5, [1, 2]] # doesn't work with column indexes

## Using integer-location for DataFrame slicing & Indexing
This type of slicing/indexing depends on positive/negative integer positions, like a normal Python list

In [None]:
dataFrame.iloc[0] # row 0

In [None]:
dataFrame.iloc[-1] # last row

In [None]:
dataFrame.iloc[[1, 2, 100]] # list of rows indexes

In [None]:
dataFrame.iloc[:5, [0, 1]] # list of columns indexes

In [None]:
dataFrame.iloc[:5, ['username', 'date']] # doesn't work since it only accepts column indexes

## Conditional Slicing

### Using a Mask
A `mask` is a Series of True/False values for each row reflecting the truth of the conditional rule specified.  
This `mask` is then used to get the True values from the data frame.

In [None]:
mask = dataFrame.sentiment > 0
mask

In [None]:
positive_sentiments = dataFrame[mask]
positive_sentiments.head()

### Using dataFrame.loc[ mask, ... ]
Allows more control on what columns to get

In [None]:
positive_sentiment = dataFrame.loc[dataFrame.sentiment > 0]
positive_sentiment.head()

In [None]:
positive_sentiment = dataFrame.loc[dataFrame.sentiment > 0, 'username']
positive_sentiment.head()

In [None]:
positive_sentiment = dataFrame.loc[dataFrame.sentiment > 0, ['username', 'date']]
positive_sentiment.head()

### Use () to separate conditions, & for AND, | for OR

In [None]:
positive_lebanon24_sentiment = dataFrame.loc[(dataFrame.sentiment > 0) & (dataFrame.username == 'Lebanon 24')]
positive_lebanon24_sentiment.head()

# 4 - Grouping Data using GroupBy

## GroupBy a column

In [None]:
username_groups = dataFrame.groupby('username')
type(username_groups)

## Check the resulted groups

In [None]:
# `groups` maps each group key to the index labels of the rows in that group.
group_dict = username_groups.groups

# Preview only the first five groups; islice stops the iteration lazily,
# so no manual counter/break bookkeeping is needed.
for group_key, row_labels in islice(group_dict.items(), 5):
    print(group_key, ' ==> ', row_labels)

## Get a specific group

In [None]:
lebanon24_group = username_groups.get_group('Lebanon 24')
lebanon24_group.head()

## Iterate Groups

In [None]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
# islice limits the preview to the first five groups without a manual counter.
for group_name, group_dataFrame in islice(username_groups, 5):
    print(group_name, ' ==> count: ', len(group_dataFrame))

## GroupBy multiple columns

In [None]:
username_location_groups = dataFrame.groupby(['username', 'location'])

In [None]:
# With several grouping columns each key is a tuple: (username, location).
group_dict = username_location_groups.groups

# Preview only the first five groups; islice stops the iteration lazily,
# so no manual counter/break bookkeeping is needed.
for group_key, row_labels in islice(group_dict.items(), 5):
    print(group_key, ' ==> ', row_labels)

## Get a specific group from a multiple column groupBy

In [None]:
lebanon24_group = username_location_groups.get_group(('Lebanon 24', 'صور')) # provide a tuple of key values (one per grouping column) to look up
lebanon24_group.head()

## Apply some function on the grouped data

### Series

In [None]:
username_mean_sentiment = dataFrame.groupby(['username'])['sentiment'].mean()
username_mean_sentiment # Series index: username, Series value: mean sentiment

### DataFrame - using reset_index()

In [None]:
username_mean_sentiment = dataFrame.groupby(['username'])['sentiment'].mean().reset_index()
username_mean_sentiment

### using external functions - agg()
Aggregate functions are functions that reduce each group to a single value

In [None]:
# Any reducer can be passed to agg(); np.mean is used here as the "external" function.
# NOTE(review): recent pandas releases reportedly emit a FutureWarning when a NumPy
# reducer such as np.mean is passed to agg() and recommend the string alias 'mean'
# instead - confirm against the installed pandas version.
username_mean_sentiment = dataFrame.groupby(['username'])['sentiment'].agg(np.mean).reset_index()
username_mean_sentiment

### using external functions - apply()

In [None]:
username_mean_sentiment = dataFrame.groupby(['username'])['sentiment'].apply(np.mean).reset_index()
username_mean_sentiment

# 5 -  Transforming the DataFrame

In [None]:
df_row_to_col = (dataFrame[['username', 'sentiment']]).copy()
df_row_to_col['label'] = dataFrame.sentiment.apply(lambda s: 'Positive' if s > 0 else 'Neutral' if s == 0 else 'Negative')

## Rows To Columns - pivot_table(index=[ ], columns=[ ], values=[ ])
- index = columns to keep
- columns = columns that provide the labels of the new columns
- values = columns that provide the values for the new columns

In [None]:
df_row_to_col.head()

In [None]:
pivot_table = df_row_to_col.pivot_table(index=['username'], columns='label', values='sentiment').reset_index()
pivot_table.head()

## Columns To Rows - melt(id_vars=[ ], var_name=' ', value_name=' ')
- id_vars = Columns to keep
- var_name = Name of the new variables column
- value_name = Name of the new value column

In [None]:
df_col_to_row = pivot_table.copy()

In [None]:
df_col_to_row.melt(id_vars='username', var_name="label", value_name="sentiment").head()

## Splitting dataFrame column data - dataFrame.str.split()

In [None]:
dataFrame.date.head()

### Series

In [None]:
splitted_data = dataFrame.date.str.split('-')
splitted_data.head()

### DataFrame - expand=True

In [None]:
splitted_data = dataFrame.date.str.split('-', expand=True)
splitted_data.head()

## Dropping a Column - doesn't change original DataFrame

In [None]:
df_after_drop = dataFrame.drop(columns=['username']) # or dataFrame.drop('username', axis='columns')
df_after_drop.columns

## Adding a Column

In [None]:
# Derive a text label from the numeric sentiment score and store it as a new column.
dataFrame['sentiment_label'] = dataFrame.sentiment.apply(lambda s: 'Positive' if s > 0 else 'Neutral' if s == 0 else 'Negative')
# A newly assigned column is appended at the end, so look it up by position -1
# instead of a hard-coded index that only matches one particular CSV layout.
dataFrame.columns[-1]

# 6 - External Functions - apply()

## Examples

### 1. print_cols(col)
- Prints all data for each column
- col = dataFrame column

In [None]:
# Module-level counter shared with print_cols: number of columns printed so far.
count = 0


def print_cols(col):
    """Print one column under a header line, for the first five calls only.

    Intended to be passed to ``DataFrame.apply`` (default axis), which hands
    each column to this function as a Series.

    Parameters
    ----------
    col : pandas.Series
        The column to print; its ``name`` is shown in the header line.

    Returns
    -------
    None
        Output goes to stdout. Once the global ``count`` equals 5 the call
        prints nothing.
    """
    global count

    if count != 5:
        count += 1
        header = 'Data for Column [{}]'.format(col.name)
        print(header)
        print(col)  # the column itself, as a Series
        print('-----------\n')
    
dataFrame.apply(print_cols)

### 2. print_rows(row, column_names=None)
- prints all rows and all of their columns or specified columns
- row = dataFrame row
- column_names = names of the columns we want to show

In [None]:
# Module-level counter shared with print_rows: number of rows printed so far.
count = 0


def print_rows(row, column_names=None):
    """Print one row, optionally restricted to some columns, for the first five calls only.

    Intended to be passed to ``DataFrame.apply(..., axis='columns')``, which
    hands each row to this function as a Series.

    Parameters
    ----------
    row : pandas.Series
        The row to print.
    column_names : list-like of labels, optional
        Columns to show. ``None`` or an empty selection prints the whole row.
        A pandas Index or NumPy array of names is accepted as well as a list.

    Returns
    -------
    None
        Output goes to stdout. Once the global ``count`` equals 5 the call
        prints nothing.
    """
    global count

    if count == 5:
        return

    count += 1

    try:
        has_selection = bool(column_names)
    except ValueError:
        # pandas Index objects and multi-element NumPy arrays refuse to be
        # truth-tested; fall back to "non-empty means a selection was given".
        has_selection = len(column_names) > 0

    print(row[column_names] if has_selection else row)
    print('-----------\n')
    
dataFrame.apply(print_rows, column_names=['username', 'date'], axis='columns')

### 3. vectorized_print(x, y)
- Vectorize the function so that it accepts lists of data and not just individual data
- x = a dataFrame column value
- y = another dataFrame column value

In [None]:
# Module-level counter shared with vectorized_print: number of pairs printed so far.
count = 0


def vectorized_print(x, y):
    """Print one ``x ==> y`` pair, for the first five calls only.

    Parameters
    ----------
    x, y : scalar
        One value from each of the two inputs. After the wrapping below, the
        public ``vectorized_print`` accepts whole array-likes and calls this
        function once per element pair.

    Returns
    -------
    None
        Output goes to stdout. Once the global ``count`` equals 5 the call
        prints nothing.
    """
    global count

    if count == 5:
        return

    count += 1

    print(x, ' ==> ', y)
    print('-----------')


# Declaring the output type up front matters for a function with side effects:
# without `otypes`, np.vectorize makes one extra probing call on the first
# element to infer the output type, which printed the first pair twice and used
# up two of the five slots. It also lets empty inputs through instead of raising.
vectorized_print = np.vectorize(vectorized_print, otypes=[object])
    
vectorized_print(dataFrame['username'], dataFrame['date'])