# 1.- Create a database with all cities and towns from Catalonia

As the project focuses on a database of news from different newspapers in Catalonia, the places database will contain every town and city of Catalonia. This information is extracted from a dataset provided by the "Institut Cartogràfic de Catalunya".

In [15]:
# Initialize a MongoDB instance
from pymongo import MongoClient

def get_db_ciutats():
    """Return a handle to the ``ciutats`` database of the local MongoDB server.

    A new ``MongoClient`` connected to ``localhost:27017`` is created on
    every call.
    """
    return MongoClient('localhost:27017').ciutats

def add_city_db(db, name, tipus, lat, lon):
    """Store one place as a document in the ``ciutats`` collection of *db*.

    The document carries the keys ``name``, ``tipus``, ``lat`` and ``lon``,
    each filled from the argument of the same name. Nothing is returned.
    """
    # NOTE(review): Collection.insert is deprecated in PyMongo 3.x in favour
    # of insert_one -- confirm against the installed driver version.
    place = {"name": name, "tipus": tipus, "lat": lat, "lon": lon}
    db.ciutats.insert(place)

In [18]:
import xlrd
from collections import OrderedDict
import simplejson as json
import utm

def ini_Catalan_cities():
    """Load the place names of ``CartoCat.xlsx`` into the places database.

    Every row of the first worksheet except row 0 supplies a name
    (column 0), a type (column 1) and UTM coordinates (columns 15 and 16).
    Rows in which either coordinate equals 0.0 are ignored. The remaining
    coordinates are converted to latitude/longitude with zone number 31 and
    zone letter 'T', and the place is stored through ``add_city_db``.
    No filtering by type and no duplicate check is applied. Progress
    messages and the final document count are printed.
    """
    print("Loading file...")
    # Open the workbook and take its first worksheet
    sheet = xlrd.open_workbook('CartoCat.xlsx').sheet_by_index(0)
    print("File loaded correctly.")
    db = get_db_ciutats()

    print("Conversio iniciada")
    for row_index in range(1, sheet.nrows):
        cells = sheet.row_values(row_index)
        nom, tipus = cells[0], cells[1]
        utm_x, utm_y = cells[15], cells[16]
        if utm_x == 0.0 or utm_y == 0.0:
            # No usable coordinates for this place
            continue
        coords = utm.to_latlon(utm_x, utm_y, 31, 'T')
        add_city_db(db, nom, tipus, coords[0], coords[1])
    print("Conversio finalitzada")
    print(" ".join(["Elements in db.ciutats: ", str(db.ciutats.count())]))

In [19]:
# Create the first database, the one containing all Catalan cities
# The following call should be executed only once. Because it takes a long time to run, it is left
# commented out to prevent accidental execution:

#ini_Catalan_cities()

Loading file...
File loaded correctly.
Conversio iniciada
Conversio finalitzada
Elements in db.ciutats:  52698


## Query example
Example of how a MongoDB collection can be queried to find a document by one of its keys.

In [22]:
# Fetch every document whose 'name' field is 'Calaf' (my town) and print each one.
# `db` and `poble` stay bound at notebook level after this cell.
db = get_db_ciutats()
poble = db.ciutats.find({'name':'Calaf'})
for a in poble:
    print a

{u'lat': 41.734805925564594, u'_id': ObjectId('554f9478366044344065476a'), u'lon': 1.5137381450247307, u'name': u'Calaf', u'tipus': u'cap'}


In [141]:
# Get the full list of municipality capitals (tipus == 'cap') in Catalonia. There are currently
# 947 towns in Catalonia, so we expect a list as long as that.
# The former bare `db.ciutats.find().count()` statement was removed: its result was neither
# stored nor shown, so it only cost an extra query.
db = get_db_ciutats()
ciut = db.ciutats.find({'tipus':'cap'})
print(ciut.count())
# Collect the names while printing them; `all_city_names` is kept at notebook level.
all_city_names = []
for a in ciut:
    print(a['name'])
    all_city_names.append(a['name'])

947
Abella, l'
Abella de la Conca
Abrera
Àger
Agramunt
Aguilar de Segarra
Agullana
Aiguafreda
Aiguaviva
Aitona
Alamús, els
Alàs
Albagés, l'
Albanyà
Albatàrrec
Albesa
Albi, l'
Albinyana
Albiol, l'
Albons
Alcanar
Alcanó
Alcarràs
Alcoletge
Alcover
Aldea, l'
Aldover
Aleixar, l'
Alella
Alfara de Carles
Alfarràs
Alfés
Alforja
Algerri
Alguaire
Alins
Alió
All
Almacelles
Almatret
Almenar
Almoster
Alòs de Balaguer
Alp
Alpens
Alpicat
Altafulla
Amer
Ametlla del Vallès, l'
Ametlla de Mar, l'
Ampolla, l'
Amposta
Anglès
Anglesola
Anserall
Ansovell
Arbeca
Arboç, l'
Arbolí
Arbúcies
Arenys de Mar
Arenys de Munt
Argelaguer
Argençola
Argentera, l'
Argentona
Armentera, l'
Arnes
Arres de Jos
Arsèguel
Artés
Artesa de Lleida
Artesa de Segre
Ascó
Aspa
Avellanes, les
Avià
Avinyó
Avinyonet de Puigventós
Avinyó Nou
Badalona
Badia del Vallès
Bagà
Balaguer
Balsareny
Banyeres del Penedès
Banyoles
Barbens
Barberà de la Conca
Barberà del Vallès
Barcelona
Barruera
Bàscara
Bassella
Batea
Bausen
Begues
Begur
Belianes
Bel

## Types of elements in the database

The most relevant element types found in the database (among other, less important ones) are the following:

Abbreviation | Meaning
---    |   ---
'cap'  | Cap de municipi
'barri'| Barri, sector urbà (+50.000 hab.)
'nucli'| Nucli de població (poble, llogaret...)
'diss.'| Veïnat disseminat
'e.m.d.'| Entitat municipal descentralitzada
mun. | Nom del municipi quan aquest no coincideix amb la capital
edif. | Edificació aïllada
edif. hist. | Edifici històric (ermita, església, castell...)


In [23]:
# Total number of documents stored in the ciutats collection (places of every type).
db.ciutats.count()

52698

# 2. Add to the database some important international cities

To achieve this objective, I will use SPARQL, a query language that lets us retrieve information from sources such as DBpedia, a database of structured information extracted from Wikipedia.

In [33]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON
 
# Ask the public DBpedia SPARQL endpoint for the name and coordinates of resources
# and print the JSON-decoded answer.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

# Every prefix used in the WHERE clause is now declared (foaf: and geo: were
# missing), so the query no longer relies on namespaces predefined by the endpoint.
# NOTE(review): the recorded run returned an empty bindings list; the cause is not
# visible here. The disabled FILTER lines would restrict results to a
# longitude/latitude box and to Catalan labels.
sparql.setQuery("""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX yago: <http://dbpedia.org/class/yago/>
PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

SELECT ?title ?geolat ?geolong
    WHERE {
        #?place rdf:type <http://dbpedia.org/ontology/Place> .
        #?place dbpedia-owl:country <http://dbpedia.org/resource/Spain> .
        ?place foaf:name ?title .
        ?place geo:lat ?geolat .
        ?place geo:long ?geolong .
        #FILTER ((?geolong > 0.5 && ?geolong < 2.7) && (?geolat < 42.5 && ?geolat > 40.5))
        #FILTER (LANG(?title)='ca')
    }
""")
results = sparql.query().convert()
print(results)

{u'head': {u'link': [], u'vars': [u'title', u'geolat', u'geolong']}, u'results': {u'distinct': False, u'bindings': [], u'ordered': True}}


# 2.- Create a database of news

In order to create a database that will contain news from a few digital news sites from Catalonia, I will be fetching the information from their RSS publications. I will get the <strong>title</strong>, the <strong>description</strong>, the <strong>date of publication</strong> and the <strong>name of the news site</strong>.

In [34]:
import feedparser
from bs4 import BeautifulSoup
from time import mktime
import time
from datetime import datetime

def get_db_news():
    """Return a handle to the ``noticies`` database of the local MongoDB server.

    ``pymongo`` is imported inside the function, and a new ``MongoClient``
    connected to ``localhost:27017`` is created on every call.
    """
    from pymongo import MongoClient
    return MongoClient('localhost:27017').noticies

def add_news(db, diari, data, titol, desc):
    """Store one news item as a document in the ``noticies`` collection of *db*.

    The document carries the keys ``diari`` (news site), ``data`` (date),
    ``titol`` (title) and ``descripcio`` (description, taken from *desc*).
    Nothing is returned.
    """
    article = {"diari": diari, "data": data, "titol": titol, "descripcio": desc}
    db.noticies.insert(article)

def get_news(diari, web, today=False):
    """Fetch an RSS feed and store its not-yet-seen entries in the news database.

    Args:
        diari: Name of the news site; stored with every inserted entry.
        web: URL of the RSS feed to parse.
        today: When true, only entries published today are kept and the
            date of every discarded entry is printed.

    Returns:
        A list with one ``[date, title, description]`` item per inserted
        entry, where ``date`` is formatted as ``dd/mm/YYYY`` and title and
        description are reduced to their text content.

    Entries whose title already exists in the ``noticies`` collection are
    skipped. Entries for which the feed provides no publication date are
    skipped as well, instead of aborting the whole feed with a KeyError.
    The number of entries in the feed is printed before processing.
    """
    db = get_db_news()
    feed = feedparser.parse(web)
    print(len(feed["entries"]))
    # Today's date is the same for every entry, so format it once.
    today_formated = time.strftime("%d/%m/%Y")
    vals = []
    for key in feed["entries"]:
        published = key.get("published_parsed")
        if published is None:
            # Without a publication date the entry cannot be stored.
            continue
        title = BeautifulSoup(key["title"]).get_text()
        # NOTE(review): mktime() reads the struct_time as local time, while
        # feedparser documents *_parsed values as UTC -- confirm whether the
        # stored dates are meant to carry that offset.
        date = datetime.fromtimestamp(mktime(published))
        date_formated = date.strftime("%d/%m/%Y")
        if today and date_formated != today_formated:
            print(date_formated)
            continue
        # The title acts as the duplicate key.
        if db.noticies.find({"titol": title}).count() > 0:
            continue
        desc_formated = BeautifulSoup(key["description"]).get_text()
        add_news(db, diari, date, title, desc_formated)
        vals.append([date_formated, title, desc_formated])
    return vals

import os

def get_news_job():
    """Download the ara, regio7 and vilaweb feeds and report the collection size.

    Prints the id of the running process, calls ``get_news`` once per feed,
    prints the number of documents in the ``noticies`` collection and ends
    with a closing message.
    """
    print(" ".join(["******* Running process:", str(os.getpid())]))
    feeds = (
        ('ara', 'http://www.ara.cat/rss/'),
        ('regio7', 'http://www.regio7.cat/elementosInt/rss/1'),
        ('vilaweb', 'http://www.vilaweb.cat/rss/'),
    )
    for diari, web in feeds:
        get_news(diari, web)
    print(get_db_news().noticies.find().count())
    print("******* Process ended!")



In [198]:
# Run one download pass over all configured feeds; the feed sizes and the
# resulting collection size are printed by the job itself.
get_news_job()

*******Running process: 13376
70
16
21
1491
*******Process ended!


# 3.- Identify cities in a text

First, I create a training set to check the results.

In [132]:
db = get_db_news()
noticies = db.noticies.find()
#db.create_collection('train_set')
# Keep only the first 100 news items. Each row gets a fifth field, initialised
# to the string "None", that will hold the location mentioned in the text.
i = 1
array_noticies = []
for noticia in noticies:
    if i>100:
        # Stop reading the cursor once the sample is complete instead of
        # stepping through every remaining document without using it.
        break
    array_noticies.append([noticia['diari'],noticia['data'],noticia['titol'],noticia['descripcio'],"None"])
    i += 1
print(len(array_noticies))
# Show each description and read its location from the keyboard.
# NOTE(review): labelling starts at index 11, so rows 0-10 are never asked
# here -- presumably they were labelled in an earlier run; confirm.
# The upper bound follows the sample size, so a collection with fewer than
# 100 documents no longer raises IndexError.
for i in range (11, min(100, len(array_noticies))):
    print(array_noticies[i][3])
    array_noticies[i][4] = raw_input()

100
Aquest diumenge l'ARA dedica el dossier a l'Europa de després de Hitler. El líder nazi es va suïcidar el 30 d'abril disparant-se un tret a la templa. El seu cos i el de la seva dona, Eva Braun (s'havien casat el dia anterior), van ser incinerats. El reportatge abordarà els efectes del conflicte bèl·lic sobre Europa i com va ser l'endemà en un continent devastat que va haver de reconstruir-se. Detallarem com mesos després van continuar els assassinats de jueus, les matances contra minories ètniques, la venjança contra els col·laboracionistes... Com els camins d'una Europa sense transports, amb cultius i ciutats arrasades, es van omplir de milions de desplaçats. I què va passar a Espanya, un país en plena dictadura. A més, tot el diari estarà 'tunejat' amb frases de testimonis, escriptors, historiadors i altres fonts sobre l'Europa de postguerra.Diumenge mateix començarà la promoció dels cartells de la Segona Guerra Mundial, una selecció feta pel diari que començarà amb l’emblemàtic 

In [139]:
# Number and print every location that was entered by hand; rows still
# holding the placeholder string 'None' are left out.
i=0
for x in array_noticies:
    if x[4]=='None':
        continue
    i+=1
    print("%s %s" % (i, x[4]))

1 Berlín
2 Barcelona
3 Berlín
4 Nova York
5 Castelló
6 Madrid
7 Qatar
8 Andalusia
9 Hospital del Mar
10 Premià de Mar
11 Reus
12 Kàtmandu
13 Pyongyang
14 Donetsk
15 Baltimore
16 Milà
17 Barcelona
18 Poble-sec
19 Barcelona
20 el Carmel
21 Pou de la Figuera
22 Teheran
23 Illinois
24 Barcelona
25 Denver
26 Sevilla
27 Còrdova
28 Bassano
29 Vic
30 Las Vegas
31 Mallorca
32 Barcelona
33 Cambridge
34 Sabadell
35 la Garriga
36 València
37 Teheran
38 Tarragona
39 Madrid
40 Manresa
41 la Sèquia
42 Berga
43 Puigcerdà


In [113]:
# Print every document currently stored in the train_set collection.
# Nothing is written to the collection here; the recorded run produced no output.
for noti in db.train_set.find():
    print noti

In [95]:
# Toy experiment: find a multi-word city name inside a sentence by collecting
# consecutive words that belong to the name.
text = 'Tots els gats son de Sant Cugat del Valles.'
text = text.replace('.','').split(' ')

cities =  {"Sant Cugat del Valles":["Sant","Cugat","del","Valles"]}

found_match = False
for word in text:
    # A word that matched nothing ends the current run, so start over.
    # (The former `cityTest = cityTest` branch was a no-op and is gone.)
    if not found_match:
        cityTest = ''
    found_match = False
    # NOTE(review): with several cities sharing a word, that word would be
    # appended once per city -- fine for this single-city example only.
    for city in cities:
        if word in cities[city]:
            cityTest += word + ' '
            found_match = True
        # cityTest ends with a space, hence the [0:-1] to drop the empty tail.
        if cityTest.split(' ')[0:-1] == city.split(' '):
            print(city)    #Print if it found a city.

Sant Cugat del Valles


In [265]:
# Read the list of city names, one per line. Only the part before the first
# ',' is used; `mides` records how many words that part has and
# `clean_all_cities` stores its words with every '*' removed.
mides = []
clean_all_cities = []
with open('all_city_names.txt') as f:
    for line in f:
        linia = line.strip()
        nom = linia.split(',')[0]  # drop whatever comes after the ','
        paraules = nom.split(' ')
        mides.append(len(paraules))
        print(paraules)
        clean_all_cities.append(nom.replace('*','').split(' '))

    print(" ".join(['La longitud dels noms de pobles catalans va de', str(min(mides)), 'a', str(max(mides)), 'paraules.']))

['Abella']
['Abella', 'de', 'la', 'Conca']
['Abrera']
['\xc3\x80ger']
['Agramunt']
['Aguilar', 'de', 'Segarra']
['Agullana']
['Aiguafreda']
['Aiguaviva']
['Aitona']
['Alam\xc3\xbas']
['Al\xc3\xa0s']
['Albag\xc3\xa9s']
['Albany\xc3\xa0']
['Albat\xc3\xa0rrec']
['Albesa']
['Albi']
['Albinyana']
['Albiol']
['Albons']
['Alcanar']
['Alcan\xc3\xb3']
['Alcarr\xc3\xa0s']
['Alcoletge']
['Alcover']
['Aldea']
['Aldover']
['Aleixar']
['Alella']
['Alfara', 'de', 'Carles']
['Alfarr\xc3\xa0s']
['Alf\xc3\xa9s']
['Alforja']
['Algerri']
['Alguaire']
['Alins']
['Ali\xc3\xb3']
['All']
['Almacelles']
['Almatret']
['Almenar']
['Almoster']
['Al\xc3\xb2s', 'de', 'Balaguer']
['Alp']
['Alpens']
['Alpicat']
['Altafulla']
['Amer']
['Ametlla', 'del', 'Vall\xc3\xa8s']
['Ametlla', 'de', 'Mar']
['Ampolla']
['Amposta']
['Angl\xc3\xa8s']
['Anglesola']
['Anserall']
['Ansovell']
['Arbeca']
['Arbo\xc3\xa7']
['Arbol\xc3\xad']
['Arb\xc3\xbacies']
['Arenys', 'de', 'Mar']
['Arenys', 'de', 'Munt']
['Argelaguer']
['Argen\xc3\xa7

In [288]:
# 1.- Index the city names by their first word: every first word maps to the
#     list of lengths (in words) of the names that start with it
from collections import Counter
from collections import defaultdict
starting_words = defaultdict(list)
for linia in clean_all_cities:
    # linia is already a list of space-free words, so its first element is
    # the first word of the name
    starting_words[linia[0]].append(len(linia))


print(starting_words['Barberà'])

[4, 3]


In [354]:
# We have sentences of the text that contain place names: collect every word
# run that starts like a known city name, then keep the runs that really are
# city names and report the most frequent one.
with open('nt-1.txt') as f:
    haystack = f.read().split()
num_words = len(haystack)
possible_cities = []
for i in range(0,num_words):
    if haystack[i][0].isupper() and haystack[i] in starting_words:
        length_city_name_min = min(starting_words[haystack[i]])
        length_city_name_max = max(starting_words[haystack[i]])
        # Take as many words as the longest name starting with this word
        # (the slice used an undefined `length_city_name` before).
        possible_cities.append([' '.join(haystack[i:i+length_city_name_max]),length_city_name_min,length_city_name_max])
res = []
clean_cities_string = []
for a in clean_all_cities:
    clean_cities_string.append(' '.join(a))
# Set copy for constant-time membership tests inside the loop below.
known_cities = set(clean_cities_string)
for s in possible_cities:
    candidate_words = s[0].replace(',','').split(' ')
    # Try the longest possible name first, down to the shortest one.
    for i in range(s[2],s[1]-1,-1):
        w = ' '.join(candidate_words[0:i])
        if w in known_cities:
            res.append(w)
print(Counter(res))
print(" ".join(['Ciutat més probable:', str(Counter(res).most_common(1))]))

        

Counter({'Sant Cugat del Vall\xc3\xa8s': 2, 'Matar\xc3\xb3': 1})
Ciutat més probable: [('Sant Cugat del Vall\xc3\xa8s', 2)]


In [155]:
#corrector
import re, collections

def words(text):
    """Split *text* into lowercase words made only of letters.

    Accented letters and 'ç' count as letters, so Catalan names such as
    "Vallès" stay in one piece instead of being cut at the accent (the
    previous ``[a-z]+`` pattern split them). Digits, underscores,
    apostrophes and any other non-letter act as separators.

    Byte strings are assumed to be UTF-8 encoded and are decoded first;
    the returned words are text (unicode) strings.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # [^\W\d_] = word characters minus digits and underscore, i.e. letters.
    return re.findall(r'[^\W\d_]+', text.lower(), re.UNICODE)

def train(features):
    """Build a frequency model from an iterable of features.

    The result is a ``defaultdict`` in which a feature seen ``n`` times is
    stored as ``n + 1``; indexing an unseen feature yields 1.
    """
    model = collections.defaultdict(lambda: 1)
    for feature in features:
        seen_so_far = model[feature]  # 1 the first time a feature shows up
        model[feature] = seen_so_far + 1
    return model

#NWORDS = train(words(file('big.txt').read()))
# Frequency model built from the words of the city-name list. The file is read
# through open() in a with-block so the handle is closed after reading (the
# file() builtin used before exists only in Python 2 and was never closed).
with open('all_city_names.txt') as f:
    NWORDS = train(words(f.read()))

# Letters tried when replacing or inserting a character.
# NOTE(review): 'i' appears twice while 'ï' and 'ü' are absent -- possibly one
# 'i' was meant to be 'ï'; confirm. Duplicates are harmless because edits1
# returns a set.
alphabet = 'abcçdefghijklmnopqrstuvwxyzàèéiòóúí'

def edits1(word):
    """Return the set of strings one simple edit away from *word*.

    For every split position the edits are: deleting one character,
    swapping two adjacent characters, replacing one character with a
    letter of ``alphabet`` and inserting a letter of ``alphabet``.
    """
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        for letter in alphabet:
            candidates.add(head + letter + tail)          # insertion
        if not tail:
            continue
        candidates.add(head + tail[1:])                   # deletion
        for letter in alphabet:
            candidates.add(head + letter + tail[1:])      # replacement
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])  # transposition
    return candidates

def known_edits2(word):
    """Return the NWORDS entries reachable by applying ``edits1`` twice to *word*."""
    found = set()
    for first_edit in edits1(word):
        for second_edit in edits1(first_edit):
            if second_edit in NWORDS:
                found.add(second_edit)
    return found

def known(words):
    """Return, as a set, the items of *words* that are present in NWORDS."""
    return {candidate for candidate in words if candidate in NWORDS}

def correct(word):
    """Return the most frequent known word close to *word*.

    Candidates are taken from the first non-empty group among: *word*
    itself if it is in NWORDS, known words one edit away, known words two
    edits away. If all groups are empty, *word* is returned unchanged.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    return max(candidates, key=NWORDS.get)

In [315]:
# correct() treats its argument as one token. NWORDS keys come from words(),
# which lowercases and keeps letter runs only, so a capitalised multi-word
# phrase such as this one finds no candidate and comes back unchanged.
correct('Sant Cugat del Vallès i')

'Sant Cugat del Vall\xc3\xa8s i'

h
