In [1]:
import pandas as pd
import numpy as np

In [2]:

members = [
    '../data/members/AMIC_ Digital.xlsx',   
    '../data/members/AMIC_ Papel.xlsx',
    '../data/members/ROW AMIC Papel + digital.xlsx'
]
sources = [
    '../data/sources/cac/DadesMapa-CAC.xlsx',
    '../data/sources/iberfier/iberifier_ procesado.xlsx',
    '../data/sources/iberfier/ROW iberifier.csv',
    '../data/sources/ojd/OJD DOWNLOAD.xlsx'
]

In [3]:
region_map = {
    'IB': ['balears', 'illes balears', 'balearic islands', 'balears; illes'],
    'CAT': ['catalunya', 'cataluña', 'catalonia'],
    'PV': ['país valencià', 'comunitat valenciana', 'valencian community', 'valencia'],
    'AD': ['andorra'],
    'ARA': ['aragó', 'aragon', 'aragón'],
    'SARDENYA': ['sardenya', 'sardinia'],
    'CAT NORD': ['catalunya nord', 'cat nord', 'catnord', 'cataluña nord'],
    'AN': ['andalusia', 'andalucía'],
    'AST': ['astúries', 'asturias'],
    'CANT': ['cantàbria', 'cantabria'],
    'CL': ['castella i lleó', 'castilla y león', 'castile and leon'],
    'CLM': ['castella-la manxa', 'castilla-la mancha', 'castile-la mancha'],
    'CEU': ['ceuta'],
    'CYM': ['canàries', 'canarias', 'canary islands'],
    'EXT': ['extremadura'],
    'GAL': ['galícia', 'galicia'],
    'MAD': ['madrid'],
    'MEL': ['melilla'],
    'MUR': ['múrcia', 'murcia'],
    'NAV': ['navarra', 'navarre'],
    'LR': ['la rioja'],
    'EUS': ['euskal herria', 'euskadi', 'país basc', 'basque country', 'país vasco']
}
def get_ccaa(region):
    for ccaa, regions in region_map.items():
        for r in regions:
            if r in region.lower():
                return ccaa
    return None


#### 1. Digital Sheet

In [4]:
def fetch_digital():
    digital = pd.read_excel(members[0], 'AMIC - Digital', skiprows=1, usecols=['Región', 'Mitjà', 'zona d\'influència', 'Región Origen', 'Area', 'OJD si/ no'])
    return clean_digital(digital)

def clean_digital(df: pd.DataFrame):
    df = df.rename(columns={
        'Región': 'region',
        'Mitjà': 'media',
        'zona d\'influència': 'area_of_influence',
        'Región Origen': 'province',
        'Area': 'area',
        'OJD si/ no': 'ojd'
    })
    df['region'].fillna('', inplace=True)
    df.dropna(subset=['media'], inplace=True)
    df['media'] = df['media'].apply(lambda x: x.split('//')[-1].split('/')[0])

    df['CCAA'] = df['region'].apply(get_ccaa)
    df['PROV'] = df['province'].fillna('').apply(lambda x: x.replace(' ', '').replace('/', '').strip())
    df['AREA'] = df['area'].fillna('').apply(lambda x: x.replace(' ', '').replace('/', '').strip())

    df.drop(columns=['region', 'province', 'area'], inplace=True)
    df['platform'] = 'digital'
    df.dropna(subset=['media'], inplace=True)
    return df

#### 2. Papel Sheet

In [7]:
def fetch_papel():
    papel = pd.read_excel(members[1], 'AMIC - Paper', usecols=['CCAA', 'PROV', 'ÀREA', 'PGD', 'mitjà', 'Area influencia', 'Distribució', 'CCAA2', 'PROV3', 'ÀREA4'])
    return clean_papel(papel)

def clean_papel(df: pd.DataFrame):
    df = df.rename(columns={
        'CCAA': 'region',
        'PROV': 'province',
        'ÀREA': 'area',
        'mitjà': 'media',
        'Area influencia': 'area_of_influence',
        'Distribució': 'distribution',
        'CCAA2': 'region2',
        'PROV3': 'province2',
        'ÀREA4': 'area2'
    })
    df.dropna(subset=['media'], inplace=True)
    df['media'] = df['media'].apply(lambda x: x.lower().strip())
    df['region'].fillna('', inplace=True)
    df['CCAA'] = df['region'].apply(get_ccaa)
    df['CCAA'].fillna(df['region2'], inplace=True)
    df['PROV'] = df['province2'].fillna('').apply(lambda x: x.replace(' ', '').replace('/', '').strip())
    df['AREA'] = df['area2'].fillna('').apply(lambda x: x.replace(' ', '').replace('/', '').strip())
    df.drop(columns=['region2', 'province2', 'area2', 'province', 'area', 'region'], inplace=True)

    df['platform'] = 'papel'
    return df


In [12]:
def fetch_iber():
    iber = pd.read_excel(sources[1], 'iberfier (2)', usecols=['TITLE', 'ACTIVE', 'URL_web', 'platforms', 'province', 'region', 'LOCATION_company_address', 'LOCATION_company_zipcode', 'lat', 'lng'])
    return clean_iber(iber)


def clean_iber(df: pd.DataFrame):
    df = df.rename(columns={
        'TITLE': 'media',
        'ACTIVE': 'active_iber',
        'URL_web': 'url',
        'platforms': 'platform_iber',
        'LOCATION_company_address': 'address_iber',
        'LOCATION_company_zipcode': 'zipcode_iber',
        'lat': 'lat_iber',
        'lng': 'lng_iber',
        'province': 'province_iber',
    })
    df.dropna(subset=['media'], inplace=True)
    df['media'] = df['media'].apply(lambda x: x.lower().strip())

    df['region'].fillna('', inplace=True)
    df['CCAA'] = df['region'].apply(get_ccaa)
    df['url'] = df['url'].apply(lambda x: x.split('//')[-1].split('/')[0].split('www.')[-1])

    df.drop(columns=['region'], inplace=True)
    return df

###  main( )

In [18]:
digital = fetch_digital()
papel = fetch_papel()
stacked = pd.concat([digital, papel], ignore_index=True)
stacked['Origin'] = 'AMIC'
iber = fetch_iber()

In [41]:
iber.shape, stacked.shape

((704, 10), (556, 10))

In [36]:
merged_url = pd.merge(stacked, iber, left_on='media', right_on='url', how='inner', suffixes=('', '_iber'))
merged_url['Origin'] = merged_url['Origin'] + ', Iberfier'
stacked_unmatched = stacked[~stacked['media'].isin(merged_url['media'])]
merged_media = pd.merge(stacked_unmatched, iber, on='media', how='inner', suffixes=('', '_iber'))
merged_media['Origin'] = merged_media['Origin'] + ', Iberfier'

stacked_unmatched_final = stacked_unmatched[~stacked_unmatched['media'].isin(merged_media['media'])]
iber_unmatched = iber[~iber['media'].isin(pd.concat([merged_url['media'], merged_media['media']]))]
iber_unmatched_final = iber_unmatched[~iber_unmatched['url'].isin(pd.concat([merged_url['media'], merged_media['media']]))]
iber_unmatched_final.rename(columns={'CCAA': 'CCAA_iber', 'media': 'media_iber'}, inplace=True)
iber_unmatched_final['Origin'] = 'Iberfier'

final_merged = pd.concat([merged_url, merged_media, stacked_unmatched_final, iber_unmatched_final], ignore_index=True).reset_index(drop=True)
final_merged.rename(columns={'url': 'url_iber'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iber_unmatched_final.rename(columns={'CCAA': 'CCAA_iber', 'media': 'media_iber'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iber_unmatched_final['Origin'] = 'Iberfier'


In [37]:
# stacked.to_csv('../data/processed/stacked.csv', index=False)
# final_merged.to_csv('../data/processed/merged.csv', index=False)

In [38]:
stacked

Unnamed: 0,media,area_of_influence,ojd,CCAA,PROV,AREA,platform,PGD,distribution,Origin
0,fibwidiario.com,Balears,,IB,IB,,digital,,,AMIC
1,laconca51.cat,Catalunya,OJD,CAT,CAT,,digital,,,AMIC
2,festacatalunya.cat,ori,OJD,CAT,CAT,,digital,,,AMIC
3,adolescents.cat,Catalunya,OJD,CAT,CAT,,digital,,,AMIC
4,eldiari.online,Castelló,,PV,PV,,digital,,,AMIC
...,...,...,...,...,...,...,...,...,...,...
551,semanario valle de elda,Alacant,,PV,AL,,papel,,Alacant: Elda i Petrer,AMIC
552,suheca,Ribera Baixa,,PV,VAL,,papel,,"Ribera Baixa: Sueca, El Perelló i Mareny de Ba...",AMIC
553,top girona,Gironés,,CAT,GI,,papel,,Demarcació de Girona,AMIC
554,tucomarca.com,València,,PV,VAL,,papel,,"València: Alborache, Buñol, Cheste, Chiva, Cor...",AMIC


In [39]:
final_merged

Unnamed: 0,media,area_of_influence,ojd,CCAA,PROV,AREA,platform,PGD,distribution,Origin,media_iber,active_iber,url_iber,address_iber,zipcode_iber,lat_iber,lng_iber,platform_iber,province_iber,CCAA_iber
0,adolescents.cat,Catalunya,OJD,CAT,CAT,,digital,,,"AMIC, Iberfier",adolescents.cat,True,adolescents.cat,"De la Llotja 9, Vic 08500 (Barcelona)",8500.0,41.925725,2.246502,web,Barcelona,CAT
1,adolescents.cat,Catalunya,no OJD,CAT,CAT,,digital,,,"AMIC, Iberfier",adolescents.cat,True,adolescents.cat,"De la Llotja 9, Vic 08500 (Barcelona)",8500.0,41.925725,2.246502,web,Barcelona,CAT
2,laopiniondetorrent.es,"Torrent, València",,PV,PV,V,digital,,,"AMIC, Iberfier",l'opinió de torrent,True,laopiniondetorrent.es,"Gómez Ferrer, 34 - 46900 Torrent (València)",46900.0,39.438741,-0.463961,"web, print",Valencia/València,PV
3,fosbury.cat,Catalunya,OJD,CAT,CAT,,digital,,,"AMIC, Iberfier",fosbury,True,fosbury.cat,"Riera del Pare Fita, 70, 08350 Arenys de Mar, ...",8350.0,41.584184,2.547518,"web, print",Barcelona,CAT
4,comunicacio21.cat,Catalunya,OJD,CAT,CAT,,digital,,,"AMIC, Iberfier",comunicació 21,True,comunicacio21.cat,"Mallorca, 354 entlo. 1ª -Barcelona - 08013",8013.0,41.400612,2.171728,"app, print, web",Barcelona,CAT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190,,,,,,,,,,Iberfier,viu molins de rei,True,viumolinsderei.com,"Àngel Guimerà, 8, Molins de Rei, 08750 Molins ...",8750.0,41.410856,2.019801,web,Barcelona,CAT
1191,,,,,,,,,,Iberfier,vox uji radio,True,radio.uji.es,"Avinguda Sos Baynat, s/n, 12071 Castelló de la...",12071.0,39.992369,-0.064846,"web, radio",Castellón/Castelló,PV
1192,,,,,,,,,,Iberfier,weekand sabadell,True,weekand.net,"Carrer de Blasco de Garay, 68 2n 1a, 08202, Sa...",8202.0,41.542981,2.115876,web,Barcelona,CAT
1193,,,,,,,,,,Iberfier,weloba,False,weloba.cat,"Aribau, 230, C.P 08006, Barcelona",8006.0,41.395469,2.150343,web,Barcelona,CAT


In [40]:
final_merged['Origin'].value_counts()

Origin
Iberfier          543
AMIC              372
AMIC, Iberfier    280
Name: count, dtype: int64