## Intro to data engineering example with data from mariadb and csv file

### Load important packages and open a conection to the database

In [None]:
import pandas as pd
from datetime import datetime
import time
import os
from apscheduler.schedulers.background import BackgroundScheduler
import mariadb
import sys

# Connect to MariaDB Platform
#try:
#    conn = mariadb.connect(
#        user="root",
#        password="",
#        host="127.0.0.1",
#        port=3306,
#        database="test"

#    )
#except mariadb.Error as e:
#    print(f"Error connecting to MariaDB Platform: {e}")
#    sys.exit(1)



### Code in charge of the data wrangling operations: load, cleaning, join, aggregate using pandas

In [None]:


print('Data wrangling, the starting time is: %s' % datetime.now())


    # Get Cursor
#db_cursor = conn.cursor()
#db_cursor.execute(
#   "SELECT DepartamentName,DepartmentID FROM department")
#table_rows = db_cursor.fetchall()

#df_depart = pd.DataFrame(table_rows, columns = ["DepartamentName", "DepartmentID"])

df_depart = pd.read_csv('dpt.txt', encoding="ISO-8859-1",
                            sep=',')
df_workers = pd.read_csv('wk.txt', encoding="ISO-8859-1",
                             sep=',')
print(df_workers)
print(df_depart)



## Filtering

In [14]:
df_workers.filter(df_workers["Age"] < 80)
print("After filtering by outlier age")
print(df_workers)


After filtering by outlier age
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120


## Aggregating

In [15]:
df_aggregated = df_workers.agg({'Age': ['sum', 'min']})
print("After aggregating age")
print(df_aggregated)


After aggregating age
     Age
sum  371
min   25


## Inner Join

In [16]:

df_inner = pd.merge(df_depart, df_workers, how='inner', on="DepartmentID")
    
print("After Inner join")    
print(df_inner)
    

After Inner join
  DepartamentName  DepartmentID    Name  Age
0           Sales            31     Bob   27
1       Marketing            33  Marcel   46
2       Marketing            33    Lisa   32
3       Marketing            33  Thomas   25
4  HumanResources            34     Tim   39
5  HumanResources            34    Josh   32


## Outer Join

In [17]:
df_outer = pd.merge(df_depart, df_workers, how='outer', on="DepartmentID")
print("After Outer join")    
print(df_outer)


After Outer join
  DepartamentName  DepartmentID     Name    Age
0           Sales            31      Bob   27.0
1       Marketing            33   Marcel   46.0
2       Marketing            33     Lisa   32.0
3       Marketing            33   Thomas   25.0
4  HumanResources            34      Tim   39.0
5  HumanResources            34     Josh   32.0
6     DataScience            35      NaN    NaN
7             NaN            36  Raphael   50.0
8             NaN            36      Dan  120.0


## Left Join

In [18]:
df_left = pd.merge(df_depart, df_workers, how='left', on="DepartmentID")
print("After Left join")        
print(df_left)


After Left join
  DepartamentName  DepartmentID    Name   Age
0           Sales            31     Bob  27.0
1       Marketing            33  Marcel  46.0
2       Marketing            33    Lisa  32.0
3       Marketing            33  Thomas  25.0
4  HumanResources            34     Tim  39.0
5  HumanResources            34    Josh  32.0
6     DataScience            35     NaN   NaN


## Right Join

In [19]:
df_right = pd.merge(df_depart, df_workers, how='right', on="DepartmentID")
print("After right join")        
print(df_right)

After right join
  DepartamentName  DepartmentID     Name  Age
0           Sales            31      Bob   27
1       Marketing            33   Marcel   46
2       Marketing            33     Lisa   32
3       Marketing            33   Thomas   25
4  HumanResources            34      Tim   39
5  HumanResources            34     Josh   32
6             NaN            36  Raphael   50
7             NaN            36      Dan  120


## Grouping

In [20]:
    grouped = df_workers.groupby('DepartmentID')
    print(grouped.get_group(33))
 

     Name  DepartmentID  Age
1  Marcel            33   46
2    Lisa            33   32
3  Thomas            33   25


## Concatenating

In [21]:
one = pd.DataFrame({
         'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Luis'],
         'DepartmentID':['33','26','4','11','34']},
         index=[1,2,3,4,5])
two = pd.DataFrame({
         'Name': ['Martin', 'Brian', 'Daniel', 'Ron', 'Betty'],
         'DepartmentID':['4','32','6','33','16']},
         index=[1,2,3,4,5])
print(pd.concat([one,two]))


     Name DepartmentID
1    Alex           33
2     Amy           26
3   Allen            4
4   Alice           11
5    Luis           34
1  Martin            4
2   Brian           32
3  Daniel            6
4     Ron           33
5   Betty           16


## With scheduler

In [None]:

def move_data():
    print('Data wrangling, the starting time is: %s' % datetime.now())


#    db_cursor = conn.cursor()
#   db_cursor.execute(
#       "SELECT DepartamentName,DepartmentID FROM department")
#   table_rows = db_cursor.fetchall()

#   df_depart = pd.DataFrame(table_rows, columns = ["DepartamentName", "DepartmentID"])

    df_depart = pd.read_csv('dpt.txt', encoding="ISO-8859-1",
                            sep=',')
    df_workers = pd.read_csv('wk.txt', encoding="ISO-8859-1",
                             sep=',')
    print(df_workers)
    print(df_depart)
    print(df_workers.isnull().sum())
    df_workers.filter(df_workers["Age"] < 80)
    print(df_workers)
    df_aggregated = df_workers.agg({'Age': ['sum', 'min']})
    print(df_aggregated)

    df_inner = pd.merge(df_depart, df_workers, how='inner', on="DepartmentID")
    print(df_inner)
    df_outer = pd.merge(df_depart, df_workers, how='outer', on="DepartmentID")
    print(df_outer)
    df_left = pd.merge(df_depart, df_workers, how='left', on="DepartmentID")
    print(df_left)
    df_right = pd.merge(df_depart, df_workers, how='right', on="DepartmentID")
    print(df_right)
    df_right.to_csv('df_right_join.txt', sep='\t', index=False)

    grouped = df_workers.groupby('DepartmentID')
    print(grouped.get_group(33))

    one = pd.DataFrame({
         'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Luis'],
         'DepartmentID':['33','26','4','11','34']},
         index=[1,2,3,4,5])
    two = pd.DataFrame({
         'Name': ['Martin', 'Brian', 'Daniel', 'Ron', 'Betty'],
         'DepartmentID':['4','32','6','33','16']},
         index=[1,2,3,4,5])
    print(pd.concat([one,two]))

if __name__ == '__main__':
    scheduler = BackgroundScheduler()
    moveData = move_data
    print(callable(moveData))
    scheduler.add_job(moveData, 'interval', hours=0.005)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    try:
        # This is here to simulate application activity (which keeps the main thread alive).
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        # Not strictly necessary if daemonic mode is enabled but should be done if possible
        scheduler.shutdown()


True
Press Ctrl+Break to exit
Data wrangling, the starting time is: 2020-09-15 11:25:41.956540
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
  DepartamentName  DepartmentID
0           Sales            31
1       Marketing            33
2  HumanResources            34
3     DataScience            35
Name            0
DepartmentID    0
Age             0
dtype: int64
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
     Age
sum  371
min   25
  DepartamentName  DepartmentID    Name  Age
0           Sales            31     Bob   27
1       Marketing    

Data wrangling, the starting time is: 2020-09-15 11:26:35.961826
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
  DepartamentName  DepartmentID
0           Sales            31
1       Marketing            33
2  HumanResources            34
3     DataScience            35
Name            0
DepartmentID    0
Age             0
dtype: int64
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
     Age
sum  371
min   25
  DepartamentName  DepartmentID    Name  Age
0           Sales            31     Bob   27
1       Marketing            33  Marcel   46
2     

Data wrangling, the starting time is: 2020-09-15 11:27:29.966117
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
  DepartamentName  DepartmentID
0           Sales            31
1       Marketing            33
2  HumanResources            34
3     DataScience            35
Name            0
DepartmentID    0
Age             0
dtype: int64
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
     Age
sum  371
min   25
  DepartamentName  DepartmentID    Name  Age
0           Sales            31     Bob   27
1       Marketing            33  Marcel   46
2     

Data wrangling, the starting time is: 2020-09-15 11:28:23.974671
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
  DepartamentName  DepartmentID
0           Sales            31
1       Marketing            33
2  HumanResources            34
3     DataScience            35
Name            0
DepartmentID    0
Age             0
dtype: int64
      Name  DepartmentID  Age
0      Bob            31   27
1   Marcel            33   46
2     Lisa            33   32
3   Thomas            33   25
4      Tim            34   39
5     Josh            34   32
6  Raphael            36   50
7      Dan            36  120
     Age
sum  371
min   25
  DepartamentName  DepartmentID    Name  Age
0           Sales            31     Bob   27
1       Marketing            33  Marcel   46
2     