In [None]:
import pandas as pd
import re
from datetime import datetime

In [None]:
# Lift pandas' display truncation so every row and every column is rendered
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Mount Google Drive at /content/drive; the read_csv paths used below live under this mount point.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## David Crosby

In [None]:
# Matches numeric "[n]" markers such as "[12]"
pattern_brackets = r'\[\d+\]'

# Load the raw David Crosby table and clean three columns in a single pass:
#   release_date     - parsed from "Month day, year" and rewritten as yyyy-dd-mm text
#   track            - every "(...)" group removed together with the whitespace around it
#   recording_period - numeric "[n]" markers removed
david_crosby_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-david-crosby.csv').assign(
    release_date=lambda frame: pd.to_datetime(frame['release_date'], format = '%B %d, %Y').dt.strftime('%Y-%d-%m'),
    track=lambda frame: frame['track'].str.replace(r'\s*\(.*?\)\s*', '', regex=True),
    recording_period=lambda frame: frame['recording_period'].str.replace(pattern_brackets, '', regex=True),
)

# Persist the cleaned table, then display it as the cell output
david_crosby_wiki.to_csv('wikipedia-david-crosby-v2.csv', index=False)
david_crosby_wiki

## Stephen Stills

In [None]:
stephen_stills_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-stephen-stills.csv')

# Remove excess albums (by other bands)
other_band_albums = ['Manassas', 'Down the Road', "Pieces", "Pierced Arrow", "Can't Get Enough"]
stephen_stills_wiki = stephen_stills_wiki[~stephen_stills_wiki['album'].isin(other_band_albums)]

# 'release_date': strip the "(...)" duplicate of the date, parse day-first
# (unparseable values become NaT), and write back as yyyy-dd-mm text.
pattern_date = r'\([^)]*\)'
release_text = stephen_stills_wiki['release_date'].str.replace(pattern_date, '', regex=True)
release_parsed = pd.to_datetime(release_text, dayfirst=True, errors='coerce')
# Rows that could not be parsed are filled with 2016-05-06, written here in yyyy-dd-mm order
stephen_stills_wiki['release_date'] = release_parsed.dt.strftime('%Y-%d-%m').fillna('2016-06-05')

# 'recording_period': blank out values containing no digit at all (recording locations),
# then drop the word 'Late' and trim surrounding whitespace.
pattern_locations = r'^[^0-9]*$'
stephen_stills_wiki['recording_period'] = (
    stephen_stills_wiki['recording_period']
    .str.replace(pattern_locations, '', regex=True)
    .str.replace('Late', '', regex=False)
    .str.strip()
)

# 'track': remove numeric "[n]" markers, then remove every "(Later recorded ...)" group
# so that only the original track title remains.
pattern_brackets = r'\[\d+\]'
stephen_stills_wiki['track'] = (
    stephen_stills_wiki['track']
    .str.replace(pattern_brackets, '', regex=True)
    .str.replace(r'\(Later recorded.*?\)', '', regex=True)
)

# Missing 'vocals' / 'writers' credits default to the artist himself
for credit_column in ('vocals', 'writers'):
    stephen_stills_wiki[credit_column] = stephen_stills_wiki[credit_column].fillna('Stephen Stills')

# Remove duplicated tracks (the 2002 remixes without horns)
remix_duplicates = [
    "Albert's Shuffle (2002 remix without horns)",
    "Season of the Witch (2002 remix without horns)",
]
stephen_stills_wiki = stephen_stills_wiki[~stephen_stills_wiki['track'].isin(remix_duplicates)]

# Renumber rows 0..n-1 after the row filters above
stephen_stills_wiki = stephen_stills_wiki.reset_index(drop=True)

stephen_stills_wiki.to_csv('wikipedia-stephen-stills-v2.csv', index=False)

stephen_stills_wiki

## Graham Nash

In [None]:
graham_nash_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-graham-nash.csv')

# Remove excess albums (live, compilations, by other bands)
excluded_albums = [
    'Stay with the Hollies',
    'In The Hollies Style',
    'Hollies',
    'The Hollies',
    'Would You Believe?',
    'Bus Stop',
    'For Certain Because',
    'Evolution',
    'Butterfly',
    'What Goes Around...',
    'Just One Look',
    'Here I Go Again',
    "I'm Alive",
    "Over the Years",
]
graham_nash_wiki = graham_nash_wiki[~graham_nash_wiki['album'].isin(excluded_albums)]

# Renumber rows 0..n-1; a later step in this cell addresses rows by position
graham_nash_wiki = graham_nash_wiki.reset_index(drop=True)

# 'release_date': strip numeric "[n]" markers first, then any "(...)" group
pattern_brackets = r'\[\d+\]'
pattern_date = r'\([^)]*\)'
graham_nash_wiki['release_date'] = (
    graham_nash_wiki['release_date']
    .str.replace(pattern_brackets, '', regex=True)
    .str.replace(pattern_date, '', regex=True)
)

def convert_date_format(date_string):
    """Normalise a scraped release-date string to this notebook's text format.

    Recognised inputs and what they become:
      * "Month YYYY"     -> "YYYY-MM"
      * "YYYY"           -> "YYYY"        (year only; no month is invented)
      * "Day Month YYYY" -> "YYYY-DD-MM"  (day before month)

    Leading/trailing whitespace is ignored, so residue left behind by the
    bracket stripping does not prevent a match.

    Parameters:
        date_string: the cell value; normally a str, or None/NaN when missing.

    Returns:
        The reformatted string, or None when the value is missing or matches
        none of the formats above.

    NOTE: another function with this same name (and different behaviour) is
    defined in the Crosby & Nash section further down; run cells in order.
    """
    if isinstance(date_string, str):
        date_string = date_string.strip()
    elif pd.isna(date_string):
        # Missing cell (None / NaN / NaT): nothing to parse.
        return None

    # (input format, output format) pairs, tried in order until one parses.
    formats = (
        ('%B %Y', '%Y-%m'),
        ('%Y', '%Y'),
        ('%d %B %Y', '%Y-%d-%m'),
    )
    for input_format, output_format in formats:
        try:
            return pd.to_datetime(date_string, format=input_format).strftime(output_format)
        except ValueError:
            # Value does not match this format; fall through to the next one.
            continue
    return None

# Reformat every release date with the helper defined above in this cell
graham_nash_wiki['release_date'] = graham_nash_wiki['release_date'].map(convert_date_format)

# Missing 'vocals' / 'writers' credits default to the artist himself
for credit_column in ('vocals', 'writers'):
    graham_nash_wiki[credit_column] = graham_nash_wiki[credit_column].fillna('Graham Nash')

# Missing 'recording_period' becomes an empty string
graham_nash_wiki['recording_period'] = graham_nash_wiki['recording_period'].fillna('')

# Remove location from 'recording_period': in the first nine rows (positions 0-8)
# only the first four characters of the value are kept.
recording_col = graham_nash_wiki.columns.get_loc('recording_period')
graham_nash_wiki.iloc[0:9, recording_col] = graham_nash_wiki.iloc[0:9, recording_col].str[:4]

# Remove numeric "[n]" markers from 'recording_period'
graham_nash_wiki['recording_period'] = graham_nash_wiki['recording_period'].str.replace(pattern_brackets, '', regex=True)

# 'producers': remove numeric "[n]" markers, then repair two name-formatting glitches
graham_nash_wiki['producers'] = (
    graham_nash_wiki['producers']
    .str.replace(pattern_brackets, '', regex=True)
    .str.replace('NashTodd', 'Nash, Todd', regex=False)
    .str.replace('Shane Fontayne,', 'Shane Fontayne', regex=False)
)

graham_nash_wiki.to_csv('wikipedia-graham-nash-v2.csv', index=False)
graham_nash_wiki

## Neil Young

In [None]:
neil_young_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-neil-young.csv')

# Remove excess albums (live, compilations, by other bands)
excluded_albums = [
    'Re·ac·tor',
    'Life',
    'Ragged Glory',
    'Sleeps with Angels',
    'Broken Arrow',
    'Greendale',
    'Americana',
    'Psychedelic Pill',
    'Colorado',
    'World Record',
    'Live Rust',
    'Year of the Horse',
    "Fuckin' Up",
    'Way Down in the Rust Bucket',
    'Toast',
    'Odeon Budokan',
    'Noise & Flowers',
    'Tuscaloosa',
    'The Monsanto Years',
    'Are You Passionate?',
]
neil_young_wiki = neil_young_wiki[~neil_young_wiki['album'].isin(excluded_albums)]

# 'release_date': strip numeric "[n]" markers and "(...)" groups, parse
# "Month day, year", and write back as yyyy-dd-mm text.
pattern_brackets = r'\[\d+\]'
pattern_date = r'\([^)]*\)'
release_text = (
    neil_young_wiki['release_date']
    .str.replace(pattern_brackets, '', regex=True)
    .str.replace(pattern_date, '', regex=True)
)
neil_young_wiki['release_date'] = pd.to_datetime(release_text, format = '%B %d, %Y').dt.strftime('%Y-%d-%m')

# 'recording_period': strip "(...)" groups (locations / duplicated dates), then "[n]" markers
recording_text = (
    neil_young_wiki['recording_period']
    .str.replace(pattern_date, '', regex=True)
    .str.replace(pattern_brackets, '', regex=True)
)

# Split dates in 'recording_period' with a comma: each of these literal dates gets ', ' appended
dates_needing_separator = [
    'June 2014',
    'November 4 – 7, 2016',
    'June 9, 2017',
    'March 28 – May 28, 2010',
    'October 10 – 12, 2011',
    'January 28, 2019',
    'February 4, 2019',
    'November 8 – December 15, 2000',
]
for date_text in dates_needing_separator:
    recording_text = recording_text.str.replace(date_text, date_text + ', ', regex=False)
neil_young_wiki['recording_period'] = recording_text

# 'producers': remove "[n]" markers, then drop the trailing comma after two names
neil_young_wiki['producers'] = (
    neil_young_wiki['producers']
    .str.replace(pattern_brackets, '', regex=True)
    .str.replace('John Hanlon,', 'John Hanlon', regex=False)
    .str.replace('Niko Bolas,', 'Niko Bolas', regex=False)
)

# Missing 'vocals' / 'writers' credits default to the artist himself
for credit_column in ('vocals', 'writers'):
    neil_young_wiki[credit_column] = neil_young_wiki[credit_column].fillna('Neil Young')

# Remove duplicated tracks: no de-duplication step is implemented here

neil_young_wiki.to_csv('wikipedia-neil-young-v2.csv', index=False)

neil_young_wiki

## Crosby & Nash

In [None]:
# Load the raw Crosby & Nash table
crosby_nash_wiki = pd.read_csv(
    '/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-cn.csv'
)

# Format dates to yyyy-dd-mm in 'release_date'
def convert_date_format(date_str):
    """Rewrite a "Month day, year" release date as yyyy-dd-mm text.

    Parameters:
        date_str: the cell value; normally a str, or None/NaN when missing.

    Returns:
        * "YYYY-DD-MM" when the value (ignoring surrounding whitespace)
          parses as "Month day, year";
        * the value unchanged otherwise - this covers year-only strings,
          strings that fail to parse, and non-string values such as NaN.
    """
    # Missing cells are NaN (a float) or None; `',' in date_str` would raise
    # TypeError on them, so pass them through untouched.
    if not isinstance(date_str, str):
        return date_str

    # No comma means this is not "Month day, year" (e.g. only a year): keep as-is.
    if ',' not in date_str:
        return date_str

    try:
        # strptime rejects leading/trailing whitespace, so trim before parsing.
        return datetime.strptime(date_str.strip(), '%B %d, %Y').strftime('%Y-%d-%m')
    except ValueError:
        # Conversion isn't possible: return the original value.
        return date_str

# Reformat every release date with the helper defined above in this cell
crosby_nash_wiki['release_date'] = crosby_nash_wiki['release_date'].map(convert_date_format)

# Remove numeric "[n]" markers from 'track'
pattern_brackets = r'\[\d+\]'
crosby_nash_wiki['track'] = crosby_nash_wiki['track'].str.replace(pattern_brackets, '', regex=True)

# Missing 'recording_period' becomes an empty string
crosby_nash_wiki['recording_period'] = crosby_nash_wiki['recording_period'].fillna('')

# Missing 'writers' / 'vocals' credits default to the duo
for credit_column in ('writers', 'vocals'):
    crosby_nash_wiki[credit_column] = crosby_nash_wiki[credit_column].fillna('Crosby & Nash')

# Drop whole rows whose 'recording_period' starts with "rudy" (case-insensitive);
# these hold a location rather than a recording period.
starts_with_rudy = crosby_nash_wiki['recording_period'].str.lower().str.startswith('rudy')
crosby_nash_wiki = crosby_nash_wiki[~starts_with_rudy]

# Drop whole rows whose 'writers' starts with "traditional" (case-insensitive)
traditional_credit = crosby_nash_wiki['writers'].str.lower().str.startswith('traditional')
crosby_nash_wiki = crosby_nash_wiki[~traditional_credit]

crosby_nash_wiki.to_csv('wikipedia-crosby-nash-v2.csv', index=False)

crosby_nash_wiki

## Crosby, Stills & Nash | Crosby, Stills, Nash & Young

In [None]:
# Matches any "(...)" group
pattern_date = r'\([^)]*\)'

# Load the raw CSN / CSNY table and clean three columns in a single pass:
#   release_date     - "(...)" groups removed, parsed from "Month day, year",
#                      rewritten as yyyy-dd-mm text
#   recording_period - 'January1970' split into 'January 1970'
#   vocals           - ' with ' replaced by a comma separator
csn_csny_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-csn-csny.csv').assign(
    release_date=lambda frame: pd.to_datetime(
        frame['release_date'].str.replace(pattern_date, '', regex=True),
        format = '%B %d, %Y',
    ).dt.strftime('%Y-%d-%m'),
    recording_period=lambda frame: frame['recording_period'].str.replace('January1970', 'January 1970', regex=False),
    vocals=lambda frame: frame['vocals'].str.replace(' with ', ', ', regex=False),
)

# Remove duplicated tracks: drop every row whose title contains one of these
# markers (case-insensitive substring match).
for duplicate_marker in ('mix', 'alternate', 'early version', 'with harmonica'):
    csn_csny_wiki = csn_csny_wiki[~csn_csny_wiki['track'].str.contains(duplicate_marker, case=False)]

# Renumber rows 0..n-1 after the row filters above
csn_csny_wiki = csn_csny_wiki.reset_index(drop=True)
csn_csny_wiki.to_csv('wikipedia-csn-csny-v2.csv', index=False)
csn_csny_wiki

## Merge all DataFrames to 'wikipedia-v2.csv'

In [None]:
# The Stills-Young Band table is read straight from the raw folder; no cleaning
# step is applied to it in this notebook before merging.
# (The name previously began with a stray '8', an invalid identifier that made
# this cell a SyntaxError and left `stills_young_wiki` undefined for the concat.)
stills_young_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-stills-young-band.csv')

# Stack the per-artist frames built in the cells above, in this fixed order,
# and renumber the rows 0..n-1.
wikipedia_v2 = pd.concat([david_crosby_wiki,
                         stephen_stills_wiki,
                         graham_nash_wiki,
                         neil_young_wiki,
                         stills_young_wiki,
                         crosby_nash_wiki,
                         csn_csny_wiki
                         ]).reset_index(drop=True)

# Persist the merged table, then display it as the cell output
wikipedia_v2.to_csv('wikipedia-v2.csv', index=False)
wikipedia_v2

Unnamed: 0,track,album,release_date,recording_period,producers,writers,vocals
0,Drive My Car,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge",Crosby,
1,Melody,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge","Crosby, Craig Doerge",
2,Monkey and the Underdog,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge","Crosby, Doerge",
3,In the Wide Ruin,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge","Doerge, Henske",
4,Tracks in the Dust,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge",Crosby,
5,Drop Down Mama,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge",Crosby,
6,Lady of the Harbor,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge","Crosby, Doerge",
7,Distances,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge",Crosby,
8,Flying Man,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge","Crosby, Doerge",
9,Oh Yes I Can,Oh Yes I Can,1989-23-01,June–October 1988,"David Crosby, Craig Doerge",Crosby,


## Merge all DataFrames to 'wikipedia-v3.csv'

In [None]:
# stills_young_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/raw/wikipedia-stills-young-band.csv')
# david_crosby_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/v3/wikipedia-david-crosby-v3.csv')
# stephen_stills_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/v3/wikipedia-stephen-stills-v3.csv')
# graham_nash_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/v3/wikipedia-graham-nash-v3.csv')
# neil_young_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/v3/wikipedia-neil-young-v3.csv')
# crosby_nash_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/v3/wikipedia-crosby-nash-v3.csv')
# csn_csny_wiki = pd.read_csv('/content/drive/MyDrive/Eksplorativna analiza podataka sa Spotify-a/Wikipedia Web Scraping/Wikipedia Datasets/v3/wikipedia-csn-csny-v3.csv')
# wikipedia_v3 = pd.concat([david_crosby_wiki,
#                          stephen_stills_wiki,
#                          graham_nash_wiki,
#                          neil_young_wiki,
#                          stills_young_wiki,
#                          crosby_nash_wiki,
#                          csn_csny_wiki
#                          ]).reset_index(drop=True)
# wikipedia_v3.to_csv('wikipedia-v3.csv', index=False)
# wikipedia_v3