## Data Integration Example

This notebook highlights the process of integrating data from four different data sources (Excel CSV files) into a master file.

#### WORKFLOW:
##### 1) Import Data


In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Folder holding the four raw CSV exports. Edit this single line to run the
# notebook against a different location.
DATA_DIR = 'D:/Springboard/Projects/SolarPower/data/raw'

# p1/p2: generation data for plants 1 and 2; w1/w2: weather-sensor data for the same plants.
p1 = pd.read_csv(f'{DATA_DIR}/Plant_1_Generation_Data.csv')
p2 = pd.read_csv(f'{DATA_DIR}/Plant_2_Generation_Data.csv')
w1 = pd.read_csv(f'{DATA_DIR}/Plant_1_Weather_Sensor_Data.csv')
w2 = pd.read_csv(f'{DATA_DIR}/Plant_2_Weather_Sensor_Data.csv')

##### Get an initial look at the structure and content of the files

In [3]:
# Preview the first three rows of the Plant 1 generation data.
p1.head(3)

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,15-05-2020 00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
1,15-05-2020 00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2,15-05-2020 00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0


##### Plant ID and Source Key appear to be categorical values. Let's see how many possible values each holds.

In [4]:
# Show the distinct plant identifiers present in the Plant 1 file.
print(p1['PLANT_ID'].unique())

[4135001]


There is only one ID in this column. For easier identification, let's change the ID to '1' (for Plant 1).

In [5]:
# Swap the single numeric plant identifier for the short string label '1'.
p1 = p1.assign(PLANT_ID='1')

In [6]:
# Collect the distinct source keys once, then report them and how many there are.
sources = p1['SOURCE_KEY'].unique()
print(sources)
print(f'There are {len(sources)} values.')

['1BY6WEcLGh8j5v7' '1IF53ai7Xc0U56Y' '3PZuoBAID5Wc2HD' '7JYdWkrLSPkdwr4'
 'McdE0feGgRqW7Ca' 'VHMLBKoKgIrUVDU' 'WRmjgnKYAwPKWDb' 'ZnxXDlPa8U1GXgE'
 'ZoEaEvLYb1n2sOq' 'adLQvlD726eNBSB' 'bvBOhCH3iADSZry' 'iCRJl6heRkivqQ3'
 'ih0vzX44oOqAx2f' 'pkci93gMrogZuBj' 'rGa61gmuvPhdLxV' 'sjndEbLyjtCKgGv'
 'uHbuxQJl8lW7ozc' 'wCURE6d3bPkepu2' 'z9Y9gH1T5YWrNuG' 'zBIq5rxdHJRwDNY'
 'zVJPv84UY57bAof' 'YxYtjZvoooNbGkE']
There are 22 values.


##### There are 22 different values in the source column, and they are hard to read. We are going to map these values to something easier to read, like 'P1_SRC_1', 'P1_SRC_2', etc. We will store the key:value mapping for later reference.

In [7]:
# Lookup from each raw source key to a readable label (P1_SRC_1, P1_SRC_2, ...),
# numbered in the order the keys first appear in `sources`.
p1_source = {
    raw_key: 'P1_SRC_' + str(position)
    for position, raw_key in enumerate(sources, start=1)
}

In [8]:
# Translate every raw source key into its readable label from p1_source.
p1_readable_keys = p1['SOURCE_KEY'].replace(p1_source)
p1['SOURCE_KEY'] = p1_readable_keys

In [9]:
# Summary statistics for the numeric columns of the Plant 1 data.
p1.describe()

Unnamed: 0,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
count,68778.0,68778.0,68778.0,68778.0
mean,3147.426211,307.802752,3295.968737,6978712.0
std,4036.457169,394.396439,3145.178309,416272.0
min,0.0,0.0,0.0,6183645.0
25%,0.0,0.0,0.0,6512003.0
50%,429.0,41.49375,2658.714286,7146685.0
75%,6366.964286,623.61875,6274.0,7268706.0
max,14471.125,1410.95,9163.0,7846821.0


In [10]:
# Count missing values in each column.
p1.isnull().sum()

DATE_TIME      0
PLANT_ID       0
SOURCE_KEY     0
DC_POWER       0
AC_POWER       0
DAILY_YIELD    0
TOTAL_YIELD    0
dtype: int64

In [11]:
# Column dtypes, non-null counts and memory footprint.
p1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68778 entries, 0 to 68777
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATE_TIME    68778 non-null  object 
 1   PLANT_ID     68778 non-null  object 
 2   SOURCE_KEY   68778 non-null  object 
 3   DC_POWER     68778 non-null  float64
 4   AC_POWER     68778 non-null  float64
 5   DAILY_YIELD  68778 non-null  float64
 6   TOTAL_YIELD  68778 non-null  float64
dtypes: float64(4), object(3)
memory usage: 3.7+ MB


In [12]:
# DATE_TIME looks like '15-05-2020 00:00': the first 10 characters are the date,
# characters 11-15 are the HH:MM time.
p1_stamp = p1['DATE_TIME'].str
p1['DATE'] = p1_stamp[:10]
p1['TIME'] = p1_stamp[11:16]

In [13]:
# Confirm the new DATE and TIME columns sit alongside the original DATE_TIME.
p1.head(3)

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,DATE,TIME
0,15-05-2020 00:00,1,P1_SRC_1,0.0,0.0,0.0,6259559.0,15-05-2020,00:00
1,15-05-2020 00:00,1,P1_SRC_2,0.0,0.0,0.0,6183645.0,15-05-2020,00:00
2,15-05-2020 00:00,1,P1_SRC_3,0.0,0.0,0.0,6987759.0,15-05-2020,00:00


In [14]:
# DATE holds day-first strings such as '15-05-2020'. Passing the format explicitly
# keeps ambiguous dates (e.g. '01-06-2020') from being read month-first.
p1['DATE'] = pd.to_datetime(p1['DATE'], format='%d-%m-%Y')

# The raw text column is no longer needed once DATE and TIME have been split out.
p1 = p1.drop(columns='DATE_TIME')

In [15]:
# Check that DATE is now a parsed date and that DATE_TIME has been removed.
p1.head()

Unnamed: 0,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,DATE,TIME
0,1,P1_SRC_1,0.0,0.0,0.0,6259559.0,2020-05-15,00:00
1,1,P1_SRC_2,0.0,0.0,0.0,6183645.0,2020-05-15,00:00
2,1,P1_SRC_3,0.0,0.0,0.0,6987759.0,2020-05-15,00:00
3,1,P1_SRC_4,0.0,0.0,0.0,7602960.0,2020-05-15,00:00
4,1,P1_SRC_5,0.0,0.0,0.0,7158964.0,2020-05-15,00:00


In [None]:
# Rebuild DATE_TIME as a real timestamp column. DATE is already datetime64 and TIME
# is an 'HH:MM' string, so appending ':00' gives to_timedelta an 'HH:MM:SS' value
# and the two columns can be added in one vectorized step. This replaces a
# row-by-row loop that called pd.datetime, an alias newer pandas releases no
# longer provide.
p1['DATE_TIME'] = p1['DATE'] + pd.to_timedelta(p1['TIME'] + ':00')

# Keep only the final columns in display order; this also discards the DATE and
# TIME helper columns. copy() detaches the result from the wider frame.
p1 = p1[['DATE_TIME', 'PLANT_ID', 'SOURCE_KEY', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']].copy()

In [None]:
# Spot-check ten random rows of the reshaped Plant 1 frame.
p1.sample(10)

#### We have analyzed the first file and made some changes to make the information easier to read. We can now apply the same steps to the second file.

In [None]:
# Preview the first three rows of the Plant 2 generation data.
p2.head(3)

In [None]:
# Show the distinct plant identifiers present in the Plant 2 file.
print(p2['PLANT_ID'].unique())

In [None]:
# Overwrite every PLANT_ID value with the short string label '2'.
p2 = p2.assign(PLANT_ID='2')

In [None]:
# Collect the distinct source keys once, then report them and how many there are.
sources = p2['SOURCE_KEY'].unique()
print(sources)
print(f'There are {len(sources)} values.')

In [None]:
# Lookup from each raw source key to a readable label (P2_SRC_1, P2_SRC_2, ...),
# numbered in the order the keys first appear in `sources`.
p2_source = {
    raw_key: 'P2_SRC_' + str(position)
    for position, raw_key in enumerate(sources, start=1)
}

In [None]:
# Translate every raw source key into its readable label from p2_source.
p2_readable_keys = p2['SOURCE_KEY'].replace(p2_source)
p2['SOURCE_KEY'] = p2_readable_keys

In [None]:
# Summary statistics for the numeric columns of the Plant 2 data.
p2.describe()

In [None]:
# Count missing values in each column.
p2.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint.
p2.info()

In [None]:
# Split DATE_TIME by position: the first 10 characters become DATE and
# characters 11-15 become TIME.
p2_stamp = p2['DATE_TIME'].str
p2['DATE'] = p2_stamp[:10]
p2['TIME'] = p2_stamp[11:16]

In [None]:
# Discard the raw DATE_TIME text, then parse the DATE strings into datetimes.
# NOTE(review): no explicit format is passed — confirm Plant 2's date layout
# is parsed the way you expect.
p2 = p2.drop(columns='DATE_TIME')
p2['DATE'] = pd.to_datetime(p2['DATE'])

In [None]:
# Check the parsed DATE column and that DATE_TIME has been removed.
p2.head()

In [None]:
# Rebuild DATE_TIME as a real timestamp column. DATE was parsed to datetime in an
# earlier cell and TIME is the five-character slice taken from the original
# DATE_TIME (expected 'HH:MM'), so appending ':00' gives to_timedelta an
# 'HH:MM:SS' value and the two columns can be added in one vectorized step.
# This replaces a row-by-row loop that called pd.datetime, an alias newer
# pandas releases no longer provide.
p2['DATE_TIME'] = p2['DATE'] + pd.to_timedelta(p2['TIME'] + ':00')

# Keep only the final columns in display order; this also discards the DATE and
# TIME helper columns. copy() detaches the result from the wider frame.
p2 = p2[['DATE_TIME', 'PLANT_ID', 'SOURCE_KEY', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']].copy()

In [None]:
# Spot-check ten random rows of the reshaped Plant 2 frame.
p2.sample(10)

##### Both Plant Generation files are ready.

In [None]:
# Preview the first three rows of the Plant 1 weather-sensor data.
w1.head(3)

In [None]:
# Distinct plant identifiers present in the Plant 1 weather file.
w1['PLANT_ID'].unique()

In [None]:
# Overwrite every PLANT_ID value with the label 'Plant 1'.
# NOTE(review): the generation frame p1 uses '1' for the same plant — confirm
# which label a later join on PLANT_ID should see.
w1 = w1.assign(PLANT_ID='Plant 1')

In [None]:
# Distinct SOURCE_KEY values present in the Plant 1 weather file.
w1['SOURCE_KEY'].unique()

In [None]:
# Overwrite every SOURCE_KEY value with one fixed label for this file.
w1 = w1.assign(SOURCE_KEY='WX Senson 1')

In [None]:
# Rebuild DATE_TIME as a real timestamp truncated to the minute.
# The date is the first 10 characters of the raw text and the time is characters
# 11-15; appending ':00' gives to_timedelta an 'HH:MM:SS' value.
# This replaces a loop that passed an unparsed DATE string to
# pd.datetime.combine and addressed columns by positions copied from the
# generation-data cells; working by column name avoids both problems.
# NOTE(review): no explicit date format is passed — confirm this file's layout
# is parsed the way you expect.
w1_date = pd.to_datetime(w1['DATE_TIME'].str.slice(0, 10))
w1_time = pd.to_timedelta(w1['DATE_TIME'].str.slice(11, 16) + ':00')
w1['DATE_TIME'] = w1_date + w1_time

# Keep only the final columns in display order.
w1 = w1[['DATE_TIME', 'PLANT_ID', 'SOURCE_KEY', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']].copy()
w1.head(3)

In [None]:
# Preview the first three rows of the Plant 2 weather-sensor data.
w2.head(3)

##### It appears we can rename some columns to aid readability


In [None]:
# Distinct plant identifiers present in the Plant 2 weather file.
w2['PLANT_ID'].unique()

In [None]:
# Overwrite every PLANT_ID value with the label 'Plant 2'.
# NOTE(review): the generation frame p2 uses '2' for the same plant — confirm
# which label a later join on PLANT_ID should see.
w2 = w2.assign(PLANT_ID='Plant 2')

In [None]:
# Distinct SOURCE_KEY values present in the Plant 2 weather file.
w2['SOURCE_KEY'].unique()

In [None]:
# Overwrite every SOURCE_KEY value with one fixed label for this file.
w2 = w2.assign(SOURCE_KEY='WX Senson 2')

In [None]:
# Split DATE_TIME into a parsed DATE and a TIME string.
# NOTE(review): no explicit date format is passed — confirm this file's layout
# is parsed the way you expect.
w2['DATE'] = pd.to_datetime(w2['DATE_TIME'].str.slice(0, 10))

# Start the time slice at index 11, as the other three frames do. Starting at 10
# kept the character between the date and the time (presumably the separating
# space) at the front of every TIME value.
w2['TIME'] = w2['DATE_TIME'].str.slice(11, 16)

# Keep only the final columns in display order; DATE_TIME is left out here, so
# no separate drop is needed.
w2 = w2[['DATE', 'TIME', 'PLANT_ID', 'SOURCE_KEY', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']].copy()
w2.head(3)

In [None]:
# Re-check the Plant 1 generation frame for missing values in each column.
p1.isnull().sum()