In [15]:
# Import Dependencies
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
# import password from config

In [16]:
# Create postgres engine and connect
engine = create_engine("postgresql://postgres:Tashina100!@localhost:5432/ETL_Project")
conn = engine.connect()

In [17]:
# Import raw/uncleaned cvs files
TSLA_data = pd.read_csv("TSLA.csv")
Tweet_data = pd.read_csv("data_elonmusk_before.csv", encoding='iso-8859-1')

## Check to ensure csv files loaded into a dataframe

In [18]:
# Check to ensure Tweet_data.csv imported correctly
Tweet_data.head()


Unnamed: 0,row ID,Tweet,Time,Retweet from,User
0,Row0,@MeltingIce Assuming max acceleration of 2 to ...,9/29/2017 17:39,,elonmusk
1,Row1,RT @SpaceX: BFR is capable of transporting sat...,9/29/2017 10:44,SpaceX,elonmusk
2,Row2,@bigajm Yup :),9/29/2017 10:39,,elonmusk
3,Row3,Part 2 https://t.co/8Fvu57muhM,9/29/2017 9:56,,elonmusk
4,Row4,Fly to most places on Earth in under 30 mins a...,9/29/2017 9:19,,elonmusk


In [19]:
# Check to ensure TSLA_data.csv imported correctly
TSLA_data.head()

Unnamed: 0,Date,Open_Price,High_Price,Low_Price,Close_Price,Adj_Close_Price,Volume
0,1/2/13,35.0,35.450001,34.709999,35.360001,35.360001,1194800
1,1/3/13,35.18,35.450001,34.75,34.77,34.77,742000
2,1/4/13,34.799999,34.799999,33.919998,34.400002,34.400002,674000
3,1/7/13,34.799999,34.799999,33.900002,34.34,34.34,442000
4,1/8/13,34.5,34.5,33.110001,33.68,33.68,1284000


##  Clean TSLA_data

In [42]:
# Check to see what datatypes
TSLA_data.dtypes

Date                object
Open_Price         float64
High_Price         float64
Low_Price          float64
Close_Price        float64
Adj_Close_Price    float64
Volume               int64
dtype: object

In [49]:
# Change Date column from object to datetime object 
TSLA_data["Date"] = pd.to_datetime(TSLA_data["Date"])

In [45]:
# Check to make sure changed took
TSLA_data.dtypes

Date               datetime64[ns]
Open_Price                float64
High_Price                float64
Low_Price                 float64
Close_Price               float64
Adj_Close_Price           float64
Volume                      int64
dtype: object

In [46]:
# Rename columns to not use key PostgresQL keywords
clean_TSLA_data = TSLA_data.rename(columns = {"Open": "open_price", "High":"high_price", "Low":"low_price",
                                               "Close":"close_price","Adj Close":"adj_close_price"})

In [47]:
# Check to verify column names changed
clean_TSLA_data.head()

Unnamed: 0,Date,Open_Price,High_Price,Low_Price,Close_Price,Adj_Close_Price,Volume
0,2013-01-02,35.0,35.450001,34.709999,35.360001,35.360001,1194800
1,2013-01-03,35.18,35.450001,34.75,34.77,34.77,742000
2,2013-01-04,34.799999,34.799999,33.919998,34.400002,34.400002,674000
3,2013-01-07,34.799999,34.799999,33.900002,34.34,34.34,442000
4,2013-01-08,34.5,34.5,33.110001,33.68,33.68,1284000


## Clean Tweet_data

In [34]:
# Check to see what datatypes
Tweet_data.dtypes

row ID                  object
Tweet                   object
Time                    object
Retweet from            object
User_name               object
Date            datetime64[ns]
dtype: object

In [48]:
# Change Time column from object to datetime object 
Tweet_data['Date'] = pd.to_datetime(Tweet_data['Time']).dt.date
Tweet_data["Date"] = pd.to_datetime(Tweet_data["Date"])

In [36]:
# Check to make sure change took
Tweet_data.dtypes

row ID                  object
Tweet                   object
Time                    object
Retweet from            object
User_name               object
Date            datetime64[ns]
dtype: object

In [50]:
# Change NAN values in Retweet from column to Not Retweeted
Tweet_data["Retweet from"].fillna("Not Retweeted", inplace = True)
Tweet_data.head()

Unnamed: 0,row ID,Tweet,Time,Retweet from,User_name,Date
0,Row0,@MeltingIce Assuming max acceleration of 2 to ...,9/29/2017 17:39,Not Retweeted,elonmusk,2017-09-29
1,Row1,RT @SpaceX: BFR is capable of transporting sat...,9/29/2017 10:44,SpaceX,elonmusk,2017-09-29
2,Row2,@bigajm Yup :),9/29/2017 10:39,Not Retweeted,elonmusk,2017-09-29
3,Row3,Part 2 https://t.co/8Fvu57muhM,9/29/2017 9:56,Not Retweeted,elonmusk,2017-09-29
4,Row4,Fly to most places on Earth in under 30 mins a...,9/29/2017 9:19,Not Retweeted,elonmusk,2017-09-29


In [39]:
# Rename columns to not use key PostgresQL keywords
Tweet_data = Tweet_data.rename(columns= {"User": "User_name"})

In [40]:
# Drop row Id column
clean_tweet_data = Tweet_data[["Tweet", "Retweet from", "User_name", "Date"]]

In [41]:
clean_tweet_data.head()

Unnamed: 0,Tweet,Retweet from,User_name,Date
0,@MeltingIce Assuming max acceleration of 2 to ...,Not Retweeted,elonmusk,2017-09-29
1,RT @SpaceX: BFR is capable of transporting sat...,SpaceX,elonmusk,2017-09-29
2,@bigajm Yup :),Not Retweeted,elonmusk,2017-09-29
3,Part 2 https://t.co/8Fvu57muhM,Not Retweeted,elonmusk,2017-09-29
4,Fly to most places on Earth in under 30 mins a...,Not Retweeted,elonmusk,2017-09-29


## Load dataframe to PostgresSQL

In [None]:

clean_TSLA_data.to_sql("stock", con=conn)
clean_tweet_data.to_sql("tweet", con=conn)