NLP SCRIPT

In [1]:
import re

import pandas as pd
import spacy
from langdetect import DetectorFactory, LangDetectException, detect

# Make language detection reproducible: langdetect's algorithm is randomized
# and may return different languages for short or ambiguous texts from one
# run to the next unless the seed is fixed before the first detect() call.
DetectorFactory.seed = 0

# Load one spaCy pipeline per supported listing language (German, French,
# Italian, English). The model packages must already be installed,
# e.g. `python -m spacy download de_core_news_sm`.
nlp_de = spacy.load('de_core_news_sm')
nlp_fr = spacy.load('fr_core_news_sm')
nlp_it = spacy.load('it_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

# Keywords for each feature in multiple languages.
#
# Structure: keywords[feature][language_code] -> list of keywords/phrases.
#   * 'terrace', 'garden', 'view', 'city_center', 'luxus' become boolean columns.
#   * 'new', 'renovated', 'old' are the possible values of the 'condition' column.
#
# All entries MUST be lower case: detect_and_process() lower-cases the listing
# text before matching, so an entry containing a capital letter can never match.
keywords = {
    'terrace': {
        'de': ['terrasse', 'balkon', 'dachterrasse', 'veranda', 'freiluftbereich', 'attika', 'attikawohnung', 'loggia', 'sonnenterrasse', 'außenbereich', 'terrassenwohnung', 'dachgarten', 'terrassengarten', 'sonnendeck'],
        'fr': ['terrasse', 'balcon', 'toit-terrasse', 'véranda', 'espace extérieur', 'attique', 'terrassement', 'terrasses', 'balcons'],
        'it': ['terrazza', 'terrazzo', 'terrazzato', 'balcone', 'balconata', 'veranda', 'area esterna', 'attico', 'attica', 'solarium'],
        'en': ['terrace', 'balcony', 'roof terrace', 'veranda', 'outdoor area', 'penthouse', 'deck', 'patio']
    },
    'garden': {
        'de': ['garten', 'grünfläche', 'hof', 'park', 'rasenfläche', 'gartenbereich', 'gartenanlage', 'privatgarten', 'gemeinschaftsgarten', 'parkanlage', 'hofgarten', 'gartenteich', 'gartenlandschaft', 'gartenanteil'],
        'fr': ['jardin', 'espace vert', 'cour', 'parc', 'pelouse', 'jardinet', 'parterre', 'verdure'],
        'it': ['giardino', 'area verde', 'cortile', 'parco', 'prato', 'giardinetto', 'orto', 'verde pubblico', 'parterre'],
        'en': ['garden', 'green space', 'yard', 'park', 'lawn', 'backyard', 'courtyard', 'garden area', 'greensward']
    },
    'new': {
        'de': ['neubau', 'neu gebaut', 'neu entwickelt', 'brandneu', 'neukonstruktion', 'bauprojekt', 'neubauprojekt', 'neubauten', 'erstbezug', 'neuerstellung', 'neueröffnung', 'fabrikneu', 'neubaugebiet'],
        'fr': ['neuf', 'construction récente', 'développé récemment', 'tout neuf', 'projet de construction', 'nouvelle construction', 'nouvellement construit', 'flambant neuf', 'nouvelle'],
        'it': ['nuova costruzione', 'nuovo', 'appena costruito', 'sviluppato di recente', 'nuovissimo', 'progetto edilizio', 'nuova edificazione', 'nuova'],
        # 'newly renovated' is deliberately not listed here: it describes a
        # renovation, which the 'renovated' keywords already cover.
        'en': ['new build', 'newly built', 'recently developed', 'brand new', 'construction project', 'new development', 'newly constructed']
    },
    'renovated': {
        'de': ['renoviert', 'modernisiert', 'saniert', 'überholt', 'restauriert', 'renovierte', 'kernsaniert', 'refurbished', 'instandgesetzt', 'aufgefrischt', 'modernisierung', 'sanierung'],
        'fr': ['rénové', 'modernisé', 'assaini', 'refait', 'restauré', 'rénovation', 'modernisation', 'assainissement', 'refaite'],
        'it': ['ristrutturato', 'modernizzato', 'sistemato', 'rifatto', 'restaurato', 'rinnovato', 'ammodernato', 'aggiornato', 'ristrutturazione'],
        'en': ['renovated', 'modernized', 'refurbished', 'overhauled', 'restored', 'revamped', 'updated', 'renewed']
    },
    'old': {
        'de': ['altbau', 'alt', 'traditionell', 'historisch', 'erbstück', 'altbauflair', 'altbauwohnung', 'baudenkmal', 'denkmalschutz', 'ursprünglich', 'zeitlos', 'antik', 'alte bausubstanz'],
        'fr': ['ancien', 'traditionnel', 'historique', 'héritage', 'ancienne construction', 'vieille bâtisse', 'patrimoine', 'historiquement'],
        'it': ['vecchia costruzione', 'antico', 'tradizionale', 'storico', 'eredità', 'vecchio', 'antica', 'storica'],
        'en': ['old building', 'old', 'traditional', 'historical', 'heritage', 'antique', 'vintage', 'classic']
    },
    'view': {
        'de': ['aussicht', 'blick', 'panorama', 'sicht', 'ausblick', 'seesicht', 'fernblick', 'weitblick', 'bergblick', 'stadtblick', 'rundumblick', 'panoramablick'],
        'fr': ['vue', 'perspective', 'panorama', 'vue dégagée', 'panoramique', 'vue panoramique', 'vue sur la mer', 'vue montagne'],
        'it': ['vista', 'panorama', 'prospettiva', 'vista aperta', 'panoramica', 'panoramici', 'vista mare', 'vista montagna'],
        'en': ['view', 'sight', 'panorama', 'outlook', 'seaview', 'panoramic', 'scenic view', 'landscape view']
    },
    'city_center': {
        'de': ['stadtzentrum', 'zentral', 'mitte', 'kerngebiet', 'stadtmitte', 'zentrumsnah', 'innenstadt', 'stadtzentral', 'city', 'zentrumslage', 'innenstadtnah', 'stadtkern', 'stadtnah'],
        'fr': ['centre-ville', 'central', 'coeur de ville', 'centre urbain', 'hyper-centre', 'en plein centre', 'centrale', 'centre'],
        'it': ['centro città', 'centrale', 'cuore della città', 'centro urbano', 'centro', 'nel centro', 'cuore del centro', 'nucleo'],
        'en': ['city center', 'central', 'downtown', 'heart of the city', 'urban center', 'city core', 'town center', 'city centre']
    },
    'luxus': {
        'de': ['luxus', 'villa', 'penthaus', 'attika', 'attikawohnung', 'luxuriös', 'luxuriöse', 'luxuriöses', 'attika-wohnung'],
        'fr': ['luxe', 'villa', 'penthouse', 'attique', 'luxueux'],
        'it': ['lusso', 'villa', 'attico', 'lussuoso', 'lussuosa'],
        'en': ['luxury', 'villa', 'penthouse', 'luxurious']
    },
}

# Function to detect and process features in multilingual text
def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
    return '' if pd.isna(value) else str(value)


def _mentions_any(keyword_list, token_texts, text):
    """Return True if at least one keyword of ``keyword_list`` occurs in the text.

    ``token_texts`` is the set of (lower-cased) token strings of the text and
    ``text`` is the lower-cased raw text itself.
    """
    for keyword in keyword_list:
        # The text is lower-cased, so compare against lower-cased keywords.
        keyword = keyword.lower()
        parts = keyword.split()
        if not parts:
            continue  # ignore empty entries instead of matching everywhere
        if keyword in token_texts:
            return True
        # Phrases ("roof terrace") and hyphenated compounds ("centre-ville")
        # usually span several tokens and therefore never equal a single
        # token. Search for them in the raw text instead, bounded by
        # non-word characters and tolerant of any whitespace between words.
        if not keyword.isalnum():
            pattern = r'(?<!\w)' + r'\s+'.join(re.escape(part) for part in parts) + r'(?!\w)'
            if re.search(pattern, text):
                return True
    return False


def detect_and_process(row):
    """Detect the listing language and derive keyword-based feature columns.

    The title and description of ``row`` are concatenated and lower-cased, the
    language is detected with langdetect, and the text is searched for the
    language-specific entries of the module-level ``keywords`` dictionary.

    Parameters
    ----------
    row : mapping (e.g. one ``pandas.Series`` row of the listings frame)
        Must provide ``'Title'`` and ``'Description'``. Missing values
        (None/NaN) are treated as empty strings.

    Returns
    -------
    The same ``row`` object, with these entries added or overwritten:

    * ``city_center``, ``garden``, ``terrace``, ``view``, ``luxus`` -- bool,
      True when a keyword of that feature occurs in the text.
    * ``condition`` -- ``'new'``, ``'renovated'`` or ``'old'``. Defaults to
      ``'old'`` when nothing matches; when several conditions match, the one
      that comes last in the iteration order of ``keywords`` wins.
    """
    text = (_as_text(row['Title']) + " " + _as_text(row['Description'])).lower()
    try:
        language = detect(text)
    except LangDetectException:
        language = 'de'  # Default to German if detection fails

    models = {'de': nlp_de, 'fr': nlp_fr, 'it': nlp_it, 'en': nlp_en}
    if language not in models:
        # Any other detected language is processed with the English model, so
        # it must also use the English keywords; otherwise no keyword list
        # would apply and every feature would silently keep its default.
        language = 'en'

    # Only token texts are needed, so run the tokenizer alone rather than the
    # full pipeline (tagger, parser, NER, ...); the resulting tokens are the same.
    doc = models[language].make_doc(text)
    token_texts = {token.text for token in doc}

    boolean_features = ('city_center', 'garden', 'terrace', 'view', 'luxus')
    condition_features = ('new', 'renovated', 'old')

    # Initialize default values for boolean features
    for feature in boolean_features:
        row[feature] = False

    # Set default condition
    row['condition'] = 'old'

    for feature, lang_keywords in keywords.items():
        if language not in lang_keywords:
            continue
        found = _mentions_any(lang_keywords[language], token_texts, text)
        if feature in boolean_features:
            row[feature] = found
        elif feature in condition_features and found:
            row['condition'] = feature

    return row

# Input / output locations (relative to the notebook's working directory)
INPUT_CSV = 'Double_Final_Filtered_immodata.csv'
OUTPUT_CSV = 'NLP_enriched_immoscout24.csv'

# Read data from CSV
try:
    df = pd.read_csv(INPUT_CSV, sep=';')
except Exception as e:
    # Report the problem, then re-raise so the run stops with the full
    # traceback. Calling exit() here would shut down the notebook kernel (or
    # end a plain script with a success status) and hide where the error
    # came from.
    print(f"Error reading the CSV file: {e}")
    raise

# Fail early with a clear message if the expected text columns are absent
missing_columns = [col for col in ('Title', 'Description') if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

# Ensure descriptions and titles are strings and handle missing data
df['Description'] = df['Description'].fillna('').astype(str)
df['Title'] = df['Title'].fillna('').astype(str)

# Apply language-specific detection and feature extraction (row by row)
df = df.apply(detect_and_process, axis=1)

# Save the enriched DataFrame to a new CSV file.
# Note: the output uses pandas' default ',' separator, not the ';' of the input.
df.to_csv(OUTPUT_CSV, index=False)

print("New CSV with enriched data has been saved.")


New CSV with enriched data has been saved.
