# Scraping iPhones Data 

## Description
The objective of this project is to scrape iPhones data from wikipedia and exporting to csv file.



In [None]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
%matplotlib inline

## Scraping data

In [2]:
url = 'https://en.wikipedia.org/wiki/IPhone'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# To Preview html in another file
with open('soup.txt','w') as f:
    f.write(str(soup))

### Finding our target table

In [3]:
table = soup.find('table', attrs={'class' : "wikitable", 'style':'font-size:88%; margin-left:auto; margin-right:auto; width:99%'})
body = table.find('tbody')

### Building our data frame

In [4]:
models = []
os = []
price = []
rows = body.find_all('tr')
for row in rows:
    models.append(row.find('th').text.strip())
    try:
        os.append(row.find_all('td')[0].text.strip())
    except:
        os.append(np.nan)
    try:
        price.append(row.find_all('td')[-1].text.strip())
    except:
        price.append(np.nan)

In [5]:
df = pd.DataFrame(data={'model':models, 'os':os, 'price':price})
df

Unnamed: 0,model,os,price
0,model,,
1,with OS,,
2,max,,
3,iPhone,iPhone OS 1.0,$499/$599*
4,iPhone 3G,iPhone OS 2.0,$199/$299*$599/$699
5,iPhone 3GS,iPhone OS 3.0,1 year
6,iPhone 4,iOS 4.0,iOS 7.1.2
7,iPhone 4S,iOS 5.0,$199/$299/$399*$649/$749/$849
8,iPhone 5,iOS 6.0,4 years
9,iPhone 5C,iOS 7.0,$99/$199*$549/$649


## Data Cleaning

### Selecting only iPhones' rows

In [6]:
df = df.iloc[3:26]
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,model,os,price
18,iPhone SE (2nd),iOS 13.4,$399/$449/$549
19,iPhone 12 / 12 Mini,iOS 14.1 (12)\niOS 14.2 (12 Mini),$829/$879/$979**\nMini: $729/$779/$879**
20,iPhone 12 Pro / 12 Pro Max,iOS 14.1 (12 Pro)\niOS 14.2 (12 Pro Max),$999/$1099/$1299\nMax: $1099/$1199/$1399
21,iPhone 13 / 13 Mini,iOS 15.0,$829/$929/$1129**\nMini: $729/$829/$1029**
22,iPhone 13 Pro / 13 Pro Max,iOS 15.0,$999/$1099/$1299/$1499\nMax: $1099/$1199/$1399...


### Manually adding prices

In [7]:
df.at[2, 'price'] = '$199'
df.at[3, 'price'] = '$599'
df.at[5, 'price'] = '$649'

### Separate models of the same generation to separate rows

In [8]:
new_models = []
old_models = df.model

for model in old_models:
    new_models.append(model.split('/'))
df.model = new_models

df = df.explode('model').reset_index(drop=True)
df

Unnamed: 0,model,os,price
0,iPhone,iPhone OS 1.0,$499/$599*
1,iPhone 3G,iPhone OS 2.0,$199/$299*$599/$699
2,iPhone 3GS,iPhone OS 3.0,$199
3,iPhone 4,iOS 4.0,$599
4,iPhone 4S,iOS 5.0,$199/$299/$399*$649/$749/$849
5,iPhone 5,iOS 6.0,$649
6,iPhone 5C,iOS 7.0,$99/$199*$549/$649
7,iPhone 5S,iOS 7.0,$199/$299/$399*$649/$749/$849
8,iPhone 6,iOS 8.0,$199/$299/$399*$649/$749/$849Plus:$299/$399/$4...
9,6 Plus,iOS 8.0,$199/$299/$399*$649/$749/$849Plus:$299/$399/$4...


### Adding "iPhone" to the splitted rows

In [9]:
new_models = []
for model in df.model:
    if model.startswith('iPhone'):
        new_models.append(model)
    else:
        new_models.append(f'iPhone {model.strip()}')
df.model = new_models
df

Unnamed: 0,model,os,price
0,iPhone,iPhone OS 1.0,$499/$599*
1,iPhone 3G,iPhone OS 2.0,$199/$299*$599/$699
2,iPhone 3GS,iPhone OS 3.0,$199
3,iPhone 4,iOS 4.0,$599
4,iPhone 4S,iOS 5.0,$199/$299/$399*$649/$749/$849
5,iPhone 5,iOS 6.0,$649
6,iPhone 5C,iOS 7.0,$99/$199*$549/$649
7,iPhone 5S,iOS 7.0,$199/$299/$399*$649/$749/$849
8,iPhone 6,iOS 8.0,$199/$299/$399*$649/$749/$849Plus:$299/$399/$4...
9,iPhone 6 Plus,iOS 8.0,$199/$299/$399*$649/$749/$849Plus:$299/$399/$4...


### Renaming OS to iOS (Version)

In [10]:
old_os = df.os
new_os = []
for os in old_os:
    try:
        new_os.append(os.split(' ')[0]+' '+os.split(' ')[1])
    except:
        print(os)
df.os = new_os
df

Unnamed: 0,model,os,price
0,iPhone,iPhone OS,$499/$599*
1,iPhone 3G,iPhone OS,$199/$299*$599/$699
2,iPhone 3GS,iPhone OS,$199
3,iPhone 4,iOS 4.0,$599
4,iPhone 4S,iOS 5.0,$199/$299/$399*$649/$749/$849
5,iPhone 5,iOS 6.0,$649
6,iPhone 5C,iOS 7.0,$99/$199*$549/$649
7,iPhone 5S,iOS 7.0,$199/$299/$399*$649/$749/$849
8,iPhone 6,iOS 8.0,$199/$299/$399*$649/$749/$849Plus:$299/$399/$4...
9,iPhone 6 Plus,iOS 8.0,$199/$299/$399*$649/$749/$849Plus:$299/$399/$4...


### Separating special models' price (Mini, Max, Plus)

In [11]:
old_price = df.price
new_price = []
for price in old_price:
    try:
        new_price.append(price.split(':'))
    except:
        print(price)
df.price = new_price
df

Unnamed: 0,model,os,price
0,iPhone,iPhone OS,[$499/$599*]
1,iPhone 3G,iPhone OS,[$199/$299*$599/$699]
2,iPhone 3GS,iPhone OS,[$199]
3,iPhone 4,iOS 4.0,[$599]
4,iPhone 4S,iOS 5.0,[$199/$299/$399*$649/$749/$849]
5,iPhone 5,iOS 6.0,[$649]
6,iPhone 5C,iOS 7.0,[$99/$199*$549/$649]
7,iPhone 5S,iOS 7.0,[$199/$299/$399*$649/$749/$849]
8,iPhone 6,iOS 8.0,"[$199/$299/$399*$649/$749/$849Plus, $299/$399/..."
9,iPhone 6 Plus,iOS 8.0,"[$199/$299/$399*$649/$749/$849Plus, $299/$399/..."


### Assigning the correct starting prices to each model

In [12]:
new_price = []
for model,price in zip(df.model, df.price):
    if 'Max' in model or 'Mini' in model or 'Plus' in model:
        new_price.append(price[1])

    else:
        new_price.append(price[0])

df.price = new_price
df

Unnamed: 0,model,os,price
0,iPhone,iPhone OS,$499/$599*
1,iPhone 3G,iPhone OS,$199/$299*$599/$699
2,iPhone 3GS,iPhone OS,$199
3,iPhone 4,iOS 4.0,$599
4,iPhone 4S,iOS 5.0,$199/$299/$399*$649/$749/$849
5,iPhone 5,iOS 6.0,$649
6,iPhone 5C,iOS 7.0,$99/$199*$549/$649
7,iPhone 5S,iOS 7.0,$199/$299/$399*$649/$749/$849
8,iPhone 6,iOS 8.0,$199/$299/$399*$649/$749/$849Plus
9,iPhone 6 Plus,iOS 8.0,$299/$399/$499*Plus


### Correcting to price format to int

In [13]:
old_price = df.price
new_price = []
for price in old_price:
    new_price.append(int(price.replace('$','').split('/')[0]))
df.price = new_price
df

Unnamed: 0,model,os,price
0,iPhone,iPhone OS,499
1,iPhone 3G,iPhone OS,199
2,iPhone 3GS,iPhone OS,199
3,iPhone 4,iOS 4.0,599
4,iPhone 4S,iOS 5.0,199
5,iPhone 5,iOS 6.0,649
6,iPhone 5C,iOS 7.0,99
7,iPhone 5S,iOS 7.0,199
8,iPhone 6,iOS 8.0,199
9,iPhone 6 Plus,iOS 8.0,299


## Exporting to csv

In [15]:
df.to_csv('iphones.csv')