## Creating a Web Data Project with Jupyter Notebooks
******
###### 1. Read in json file(s) resulting from your work with requests/selenium and beautifulsoup

In [3]:
# Import necessary packages
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

# Read in json files and create dataframes
filename = '2017-03-14.YouTubeMusicVideos.json'
data = pd.read_json(filename)
filename2 = '2017-03-16.YouTubeMusicVideos.json'
data2 = pd.read_json(filename2)

###### 2. Clean the data column by column
- Ensure that addresses (or other text data) are consistent 
- Eliminate string characters from numeric values 
- Exclude redundant data as appropriate

In [14]:
data

Unnamed: 0,Channel,Channel Link,Duration,Title,Uploaded,Video Link,Views
0,ArianaGrandeVevo,https://www.youtube.com/user/ArianaGrandeVevo,3:58,Ariana Grande - Side To Side ft. Nicki Minaj,6,https://www.youtube.com/watch?v=SXiSVQZLje8,834147250
1,Charlie Puth,https://www.youtube.com/user/CharliesVlogs,3:51,Charlie Puth - We Don't Talk Anymore (feat. Se...,7,https://www.youtube.com/watch?v=3AtDnEC4zak,795283436
2,ShawnMendesVEVO,https://www.youtube.com/channel/UC4-TgOSMJHn-L...,4:17,Shawn Mendes - Treat You Better,8,https://www.youtube.com/watch?v=lY2yjAdbvdQ,775810406
3,TheWeekndVEVO,https://www.youtube.com/user/TheWeekndVEVO,4:34,The Weeknd - Starboy (official) ft. Daft Punk,5,https://www.youtube.com/watch?v=34Na4j8AVgA,765014209
4,ArianaGrandeVevo,https://www.youtube.com/user/ArianaGrandeVevo,4:15,Ariana Grande - Into You,9,https://www.youtube.com/watch?v=1ekZEVeXwek,444247331
5,MeghanTrainorVEVO,https://www.youtube.com/user/MeghanTrainorVEVO,3:43,Meghan Trainor - NO,11,https://www.youtube.com/watch?v=cMTAUr3Nm6I,418068430
6,SiaVEVO,https://www.youtube.com/user/SiaVEVO,5:52,Sia - The Greatest,6,https://www.youtube.com/watch?v=GKSRyLdjsPA,392138630
7,marshmello,https://www.youtube.com/channel/UCEdvpU2pFRCVq...,3:20,Marshmello - Alone (Official Music Video),8,https://www.youtube.com/watch?v=ALZHF5UqnU4,342854911
8,MeghanTrainorVEVO,https://www.youtube.com/user/MeghanTrainorVEVO,3:03,Meghan Trainor - Me Too,10,https://www.youtube.com/watch?v=qDRORgoZxZU,334996764
9,Ed Sheeran,https://www.youtube.com/user/EdSheeran,4:24,Ed Sheeran - Shape of You [Official Video],1 month ago,https://www.youtube.com/watch?v=JGwWNGJdvx8,334725587


In [5]:
# Eliminate string characters from numeric data
data['Views'] = data['Views'].str.replace(',','')
data2['Views'] = data2['Views'].str.replace(',','')

data['Uploaded'] = data['Uploaded'].str.replace(' months ago','')
data2['Uploaded'] = data2['Uploaded'].str.replace(' months ago','')

#data['Duration'] = data['Duration'].str.split()

In [7]:
data['Views']

0     834147250
1     795283436
2     775810406
3     765014209
4     444247331
5     418068430
6     392138630
7     342854911
8     334996764
9     334725587
10    332170930
11    287412071
12    230928436
13    208704766
14    203162999
15    198813313
16    195721956
17    191877435
18    189537396
Name: Views, dtype: object

###### 3. Merge dataframes from separate json files as appropriate
- Find the intersection of two (or more) sets
- Compare the intersection with the newer set to find 'New Products'  
    - When found, add the starting date
- Compare the intersection with the older set to find 'Closed Products'  
    - When found, add the closing date
    - Compare closing date with starting date to find days on market      

In [15]:
# Create the inner merge of the two dataframes
dataInnerMerge = pd.merge(data, data2, \
                          on=['Video Link'],\
                          how='inner')

In [None]:
# Determine the set of 'new products'
data2['key1'] = 1
dataInnerMerge['key2'] = 1
newProducts = pd.merge(data2, dataInnerMerge, \
                      on=['ID', 'Title','Location','Price'], \
                      how = 'left')

# Create the associated dataframe
newProducts = newProducts[newProducts.key2 != newProducts.key1]

# Add the starting date in the dataframe in which the product is first shown

In [None]:
# Determine the set of 'closed products'
NoDupsData['key1'] = 1
closedProducts = pd.merge(data, dataInnerMerge, \
                      on=['ID', 'Title','Location','Price'], \
                      how = 'left')

# Create the associated dataframe
closedProducts = closedProducts[closedProducts.key2 != closedProducts.key1]

# Add the closing date in the dataframe in which the product is last shown


# Find days on market by comparing the starting date with the closing date

###### 4. Create visualizations
- Histogram of Prices
- Histogram of Days on Market
- Scatter Diagram of Prices vs Days on Market
- Pie Chart of New, Like New, Used

In [None]:
# Create a histogram of Prices
goodPrices = newProducts.Price[newProducts['Price'] > 1.0]
goodPrices = goodPrices[goodPrices < 1500.0]
plt.hist(goodPrices)
plt.title("Product Prices")
plt.xlabel("Price")
plt.ylabel("Frequency")
plt.show()

#Temporary Data for Days on Market ... for Scatter Plot Example
scatterData = {'Price': goodPrices,\
               'daysOnMarket' : \
               np.random.randint(1,30,size=len(goodPrices))}
sD = pd.DataFrame(scatterData)

# Create a histogram of Days on Market
plt.hist(sD.daysOnMarket)
plt.title("Days On Market")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

# Create a Scatter Diagram of Prices vs Days on Market
fig = plt.figure()
ax = fig.add_subplot(111)
x = sD.Price
y = sD.daysOnMarket
ax.scatter(x,y,color='blue')
ax.set_ylim([0,30])
ax.set_xlim([0,1600])

plt.title("Days On Market vs Price")
plt.xlabel("Price")
plt.ylabel("Days On Market")
plt.show()

# Create a Pie Chart of New, Like New, and Used products
colors = ["#E13F29", "#D69A80", "#D63B59"]
# Create a pie chart
plt.pie(
    [newProducts.Condition[newProducts.Condition == 'like new'].count(),\
     newProducts.Condition[newProducts.Condition == 'new'].count(), \
     newProducts.Condition[newProducts.Condition == 'used'].count()],
    labels= ['like_new','new','used'],
    shadow=True,
    # with colors
    colors=colors,
    # with one slide exploded out
    explode=(0.1, 0, 0),
    # with the start angle at 90%
    startangle=90,
    # with the percent listed as a fraction
    autopct='%1.1f%%',
    )

# View the plot drop above
plt.axis('equal')

# View the plot
plt.tight_layout()
plt.show()



###### 5. Trends from your data  
- Search for specific brands and offer counts for each
- Search for product types and offer counts for each
- For each of the above (and other) give counts for day over day or week over week

In [None]:
# List the products representing specific brands of your choosing
# for each date provided
fenderData = NoDupsData[NoDupsData['Title'].str.contains('fender ')]
yamahaData = NoDupsData[NoDupsData['Title'].str.contains('yamaha ')]
gibsonData = NoDupsData[NoDupsData['Title'].str.contains('gibson ')]
fenderData2 = NoDupsData2[NoDupsData2['Title'].str.contains('fender ')]
yamahaData2 = NoDupsData2[NoDupsData2['Title'].str.contains('yamaha ')]
gibsonData2 = NoDupsData2[NoDupsData2['Title'].str.contains('gibson ')]

# Create a bar chart of products by brand (for the brands chosen)
fenderCount = len(fenderData)
yamahaCount = len(yamahaData)
gibsonCount = len(gibsonData)
fenderCount2 = len(fenderData2)
yamahaCount2 = len(yamahaData2)
gibsonCount2 = len(gibsonData2)

objects = ('Fender','Fender2','', 'Yamaha', 'Yamaha2','', \
           'Gibson', 'Gibson2')
colors = ('green','red','blue','green','red','blue','green','red')
y_pos = np.arange(len(objects))
performance = [fenderCount,fenderCount2,0,yamahaCount,yamahaCount2,\
               0,gibsonCount,gibsonCount2]
fig = plt.figure(figsize=(15,7))
plt.bar(y_pos, performance, align='center',alpha=0.5, color=colors)
plt.xticks(y_pos, objects)
plt.ylabel('Counts')
plt.title('Brands')
 
plt.show()

# List the products of a specific type
guitarData = NoDupsData[NoDupsData['Title'].str.contains('guitar ')]
audioData = NoDupsData[NoDupsData['Title'].str.contains('audio ')]
drumData = NoDupsData[NoDupsData['Title'].str.contains('drum ')]
guitarData2 = NoDupsData2[NoDupsData2['Title'].str.contains('guitar ')]
audioData2 = NoDupsData2[NoDupsData2['Title'].str.contains('audio ')]
drumData2 = NoDupsData2[NoDupsData2['Title'].str.contains('drum ')]


# Create a bar chart of products by specific type
# for each date provided
guitarCount = len(guitarData)
audioCount = len(audioData)
drumCount = len(drumData)
guitarCount2 = len(guitarData2)
audioCount2 = len(audioData2)
drumCount2 = len(drumData2)

objects = ('Guitar','Guitar2','', 'Audio','Audio2','', 'Drum','Drum2')
y_pos = np.arange(len(objects))
performance = [guitarCount,guitarCount2,0,audioCount,audioCount2,\
               0,drumCount,drumCount2]
colors = ('green','red','blue','green','red','blue','green','red') 
plt.bar(y_pos, performance, align='center', alpha=0.5, color=colors)
plt.xticks(y_pos, objects)
plt.ylabel('Counts')
plt.title('Product Type')
 
plt.show()

In [None]:
fenderData[['Link','Title']]

In [None]:
drumData[['Link','Title']]