# 1. Downloading Datasets
Instead of downloading/unzipping the data separately, we will do that in the script as well so that it is automated. We can use the terminal command `curl`, which allows us to download the files directly.

In [1]:
import csv
import pathlib
import zipfile

# Remote locations the zipped datasets are downloaded from.
SMS_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
YOUTUBE_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip"

# The project root is taken to be two levels above the current working directory.
PROJECT_DIR = pathlib.Path.cwd().resolve().parent.parent
DATA_DIR = PROJECT_DIR.joinpath("data")
ZIPS_DIR = DATA_DIR.joinpath("zips")
# Create the download folder (and any missing parents); a no-op if it already exists.
ZIPS_DIR.mkdir(parents=True, exist_ok=True)

# Local paths the zipped datasets are downloaded to.
SMS_SPAM_ZIP_PATH = ZIPS_DIR.joinpath("sms-spam-dataset.zip")
YOUTUBE_SPAM_ZIP_PATH = ZIPS_DIR.joinpath("youtube-spam-dataset.zip")

In [2]:
# terminal command to download the data the data to our specified paths
!curl $SMS_SPAM_ZIP -o $SMS_SPAM_ZIP_PATH
!curl $YOUTUBE_SPAM_ZIP -o $YOUTUBE_SPAM_ZIP_PATH

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  198k  100  198k    0     0   198k      0  0:00:01 --:--:--  0:00:01  453k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  159k  100  159k    0     0   159k      0  0:00:01 --:--:--  0:00:01  378k


In [3]:
# Destination folders for the extracted archives: one sub-folder per dataset,
# grouped under a common "spam-classifier" directory inside DATA_DIR.
SPAM_CLASSIFIER_DIR = DATA_DIR.joinpath("spam-classifier")
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR.joinpath("sms_spam")
YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR.joinpath("youtube_spam")

# Create both folders, including missing parents; existing folders are left alone.
for _extract_dir in (SMS_SPAM_DIR, YOUTUBE_SPAM_DIR):
    _extract_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Extract each downloaded archive into its dataset directory.
# The standard-library zipfile module is used instead of shelling out to
# tar (Windows) or unzip (Linux / macOS), so the same cell runs unchanged on
# every platform and is not affected by spaces in the paths.
# Files that already exist from a previous run are overwritten.
for zip_path, target_dir in (
    (SMS_SPAM_ZIP_PATH, SMS_SPAM_DIR),
    (YOUTUBE_SPAM_ZIP_PATH, YOUTUBE_SPAM_DIR),
):
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(target_dir)

# 2. Extracting, Reviewing and Combining Datasets
In general, for big data that is going to be stored in a database, we would keep all of the data and store each dataset individually.

In [9]:
import pandas as pd

In [37]:
# The SMS corpus is read as a headerless, tab-separated file: <label>\t<message>.
sms_spam_input_path = SMS_SPAM_DIR / "SMSSpamCollection"

sms_df = pd.read_csv(
    sms_spam_input_path,
    sep="\t",
    header=None,
    names=["label", "text"],
    # Messages may contain literal double quotes. With the default quoting
    # pandas treats them as field quotes and can merge several lines into a
    # single row, so quote handling is switched off.
    quoting=csv.QUOTE_NONE,
)
# Numeric label: 1 for "spam", 0 for anything else.
sms_df["raw_label"] = sms_df["label"].eq("spam").astype("int64")
sms_df["source"] = "sms-spam"  # to distinguish the datasets later
sms_df.head()

Unnamed: 0,label,text,raw_label,source
0,ham,"Go until jurong point, crazy.. Available only ...",0,sms-spam
1,ham,Ok lar... Joking wif u oni...,0,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,sms-spam
3,ham,U dun say so early hor... U c already then say...,0,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,sms-spam


In [38]:
# Load every CSV in the YouTube folder and reshape it to the columns
# text / raw_label / label / source.
youtube_dfs = []
# sorted() makes the row order reproducible; plain glob order depends on the filesystem.
for path in sorted(YOUTUBE_SPAM_DIR.glob("*.csv")):  # only csv files directly in the folder
    df = (
        pd.read_csv(path)
        .rename(columns={"CLASS": "raw_label", "CONTENT": "text"})
        .assign(
            # raw_label == 1 -> "spam", anything else -> "ham"
            label=lambda frame: frame["raw_label"].eq(1).map({True: "spam", False: "ham"}),
            source="youtube-spam",
        )
        .loc[:, ["text", "raw_label", "label", "source"]]
    )
    youtube_dfs.append(df)

# Combine all dataframes. ignore_index=True gives every row its own index label
# instead of repeating 0..n-1 once per file.
youtube_df = pd.concat(youtube_dfs, ignore_index=True)
youtube_df.head()

Unnamed: 0,text,raw_label,label,source
0,"Huh, anyway check out this you[tube] channel: ...",1,spam,youtube-spam
1,Hey guys check out my new channel and our firs...,1,spam,youtube-spam
2,just for test I have to say murdev.com,1,spam,youtube-spam
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,spam,youtube-spam
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,spam,youtube-spam


In [41]:
# Stack the SMS and YouTube frames into one dataset; columns are aligned by name.
# ignore_index=True renumbers the rows 0..n-1. Without it the index labels of
# the two sources overlap, so a label-based lookup such as data.loc[0] would
# return more than one row.
data = pd.concat([sms_df, youtube_df], ignore_index=True)
data.head()

Unnamed: 0,label,text,raw_label,source
0,ham,"Go until jurong point, crazy.. Available only ...",0,sms-spam
1,ham,Ok lar... Joking wif u oni...,0,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,sms-spam
3,ham,U dun say so early hor... U c already then say...,0,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,sms-spam
