In [6]:
import pandas as pd
import spacy

# Load one spaCy pipeline per supported language, keyed by its language code
language_models = {
    code: spacy.load(package)
    for code, package in (
        ('de', 'de_core_news_sm'),
        ('en', 'en_core_web_sm'),
        ('fr', 'fr_core_news_sm'),
        ('it', 'it_core_news_sm'),
    )
}

# Individual handles to the same pipeline objects held in the mapping
nlp_de = language_models['de']
nlp_en = language_models['en']
nlp_fr = language_models['fr']
nlp_it = language_models['it']

# Feature -> language code -> keywords that signal the feature in a listing.
# Every feature carries five keywords for each of de / en / fr / it.
keywords = {
    'terrace': dict(
        de=['terrasse', 'balkon', 'dachterrasse', 'veranda', 'freiluftbereich'],
        en=['terrace', 'balcony', 'deck', 'veranda', 'patio'],
        fr=['terrasse', 'balcon', 'pont', 'veranda', 'patio'],
        it=['terrazza', 'balcone', 'ponte', 'veranda', 'patio'],
    ),
    'garden': dict(
        de=['garten', 'grünfläche', 'hof', 'park', 'rasenfläche'],
        en=['garden', 'green space', 'yard', 'lawn', 'parkland'],
        fr=['jardin', 'espace vert', 'cour', 'pelouse', 'parc'],
        it=['giardino', 'verde', 'cortile', 'prato', 'parco'],
    ),
    'new': dict(
        de=['neubau', 'neu gebaut', 'neu entwickelt', 'brandneu', 'neukonstruktion'],
        en=['new build', 'newly built', 'new development', 'brand new', 'new construction'],
        fr=['nouvelle construction', 'récent', 'nouveau développement', 'flambant neuf', 'construction récente'],
        it=['nuova costruzione', 'nuovo', 'sviluppo recente', 'nuovissimo', 'edificazione recente'],
    ),
    'renovated': dict(
        de=['renoviert', 'modernisiert', 'saniert', 'überholt', 'restauriert'],
        en=['renovated', 'refurbished', 'modernized', 'overhauled', 'restored'],
        fr=['rénové', 'restauré', 'modernisé', 'réhabilité', 'réaménagé'],
        it=['ristrutturato', 'rinnovato', 'modernizzato', 'ristrutturazione completa', 'restaurato'],
    ),
    'old': dict(
        de=['altbau', 'alt', 'traditionell', 'historisch', 'erbstück'],
        en=['old building', 'antique', 'traditional', 'historic', 'heritage'],
        fr=['ancien', 'traditionnel', 'vieux', 'historique', 'patrimonial'],
        it=['antico', 'tradizionale', 'vecchio', 'storico', 'patrimonio'],
    ),
    'view': dict(
        de=['aussicht', 'blick', 'panorama', 'sicht', 'ausblick'],
        en=['view', 'panorama', 'vista', 'outlook', 'scenery'],
        fr=['vue', 'panorama', 'perspective', 'horizon', 'paysage'],
        it=['vista', 'panorama', 'scorcio', 'orizzonte', 'paesaggio'],
    ),
    'city_center': dict(
        de=['stadtzentrum', 'zentral', 'mitte', 'kerngebiet', 'stadtmitte'],
        en=['city center', 'downtown', 'central', 'urban core', 'town center'],
        fr=['centre ville', 'central', 'coeur de ville', 'centre urbain', 'centre-ville'],
        it=['centro città', 'centrale', 'cuore della città', 'nucleo urbano', 'centro storico'],
    ),
}

# Read the semicolon-separated listings file
try:
    df = pd.read_csv('AATest_aargau.csv', sep=';')
except Exception as e:
    print(f"Error reading the CSV file: {e}")
    # Re-raise instead of calling exit(): exit() would end the whole session
    # (and, run as a script, finish with status 0 as if nothing went wrong),
    # whereas re-raising stops this cell and keeps the traceback.
    raise

# Ensure descriptions and titles are strings and handle missing data
df['Description'] = df['Description'].fillna('').astype(str)
df['Title'] = df['Title'].fillna('').astype(str)
# Single text field (title, a space, description) consumed by the feature extraction
df['combined_info'] = df['Title'] + " " + df['Description']

# Detect language and apply feature extraction
def _flatten_words(text):
    """Lower-case ``text`` and replace every run of non-alphanumerics by one space."""
    return " ".join("".join(ch if ch.isalnum() else " " for ch in text.lower()).split())


def _keyword_hit(token_texts, flat_text, candidates):
    """Return True if any keyword in ``candidates`` occurs in the listing.

    Args:
        token_texts: set of the listing's token strings (lower-cased text).
        flat_text: the listing passed through ``_flatten_words`` and padded
            with one space on each side, used for whole-phrase lookups.
        candidates: iterable of keywords; compared case-insensitively.
    """
    for keyword in candidates:
        keyword = keyword.lower()
        if keyword in token_texts:
            return True
        # Keywords with spaces or hyphens are normally split over several
        # tokens, so additionally look for them as a whole phrase.
        if not keyword.isalnum():
            phrase = _flatten_words(keyword)
            if phrase and f" {phrase} " in flat_text:
                return True
    return False


def detect_and_process_language(row):
    """Add one boolean column per entry of ``keywords`` to a listing row.

    The language of ``row['combined_info']`` is detected with ``langdetect``;
    if detection fails or yields a language without an entry in
    ``language_models``, German is used. The previous implementation tested
    whether the language *code* (e.g. ``'de'``) was a substring of the text,
    which selected German for practically every listing.

    Args:
        row: mapping / ``pandas.Series`` holding a string under ``'combined_info'``.

    Returns:
        The same ``row`` object with ``row[feature]`` set to True/False for
        every feature in ``keywords``.
    """
    # This cell does not import langdetect at the top, so import it locally.
    # NOTE(review): langdetect results are randomised unless
    # DetectorFactory.seed is set - consider seeding for reproducible output.
    from langdetect import detect, LangDetectException

    text = row['combined_info'].lower()
    try:
        lang = detect(text)
    except LangDetectException:
        # Raised e.g. for empty text; handled by the default below
        lang = None

    # Default to German
    if lang not in language_models:
        lang = 'de'
    nlp = language_models[lang]

    # Only token texts are needed, so run the tokenizer alone instead of the
    # full pipeline.
    doc = nlp.make_doc(text)
    token_texts = {token.text for token in doc}
    flat_text = f" {_flatten_words(text)} "

    for feature, lang_keywords in keywords.items():
        row[feature] = _keyword_hit(token_texts, flat_text, lang_keywords[lang])
    return row

# Run detection and feature extraction on every listing (one call per row)
df = df.apply(detect_and_process_language, axis='columns')

# Write the enriched frame to disk without the index column
df.to_csv('enriched_properties.csv', index=False)

print("New CSV with enriched data has been saved.")


New CSV with enriched data has been saved.


In [8]:
import pandas as pd
import spacy

# Load the German spaCy pipeline used to tokenise the listings
GERMAN_PIPELINE = 'de_core_news_sm'
nlp_de = spacy.load(GERMAN_PIPELINE)

# German keywords per feature; a listing gets the feature when one of them occurs
keywords = dict(
    terrace=[
        'terrasse', 'balkon', 'dachterrasse', 'veranda',
        'freiluftbereich', 'attika', 'attikawohnung', 'loggia',
        'sonnenterrasse', 'außenbereich', 'terrassenwohnung', 'dachgarten',
    ],
    garden=[
        'garten', 'grünfläche', 'hof', 'park',
        'rasenfläche', 'gartenbereich', 'gartenanlage', 'privatgarten',
        'gemeinschaftsgarten', 'parkanlage', 'hofgarten',
    ],
    new=[
        'neubau', 'neu gebaut', 'neu entwickelt', 'brandneu',
        'neukonstruktion', 'bauprojekt', 'neubauprojekt', 'neubauten',
        'erstbezug', 'neuerstellung', 'neueröffnung',
    ],
    renovated=[
        'renoviert', 'modernisiert', 'saniert', 'überholt',
        'restauriert', 'renovierte', 'kernsaniert', 'refurbished',
        'instandgesetzt', 'aufgefrischt', 'modernisierung',
    ],
    old=[
        'altbau', 'alt', 'traditionell', 'historisch',
        'erbstück', 'altbauflair', 'altbauwohnung', 'baudenkmal',
        'denkmalschutz', 'ursprünglich', 'zeitlos',
    ],
    view=[
        'aussicht', 'blick', 'panorama', 'sicht',
        'ausblick', 'seesicht', 'fernblick', 'weitblick',
        'bergblick', 'stadtblick', 'rundumblick',
    ],
    city_center=[
        'stadtzentrum', 'zentral', 'mitte', 'kerngebiet',
        'stadtmitte', 'zentrumsnah', 'innenstadt', 'stadtzentral',
        'city', 'zentrumslage', 'innenstadtnah',
    ],
)


# Read the semicolon-separated listings file
try:
    df = pd.read_csv('AATest_aargau.csv', sep=';')
except Exception as e:
    print(f"Error reading the CSV file: {e}")
    # Re-raise instead of calling exit(): exit() would end the whole session
    # (and, run as a script, finish with status 0 as if nothing went wrong),
    # whereas re-raising stops this cell and keeps the traceback.
    raise

# Ensure descriptions and titles are strings and handle missing data
df['Description'] = df['Description'].fillna('').astype(str)
df['Title'] = df['Title'].fillna('').astype(str)

# Function to detect and process features in German text
def detect_and_process_german(row):
    """Flag keyword features on one listing and derive its ``condition``.

    Args:
        row: mapping / ``pandas.Series`` with string entries ``'Title'`` and
            ``'Description'``.

    Returns:
        The same ``row`` with

        * one boolean per feature in ``keywords`` - True when some token of
          the lower-cased text equals one of the feature's keywords;
        * ``row['condition']`` set to ``'New'``, ``'Renovated'`` or ``'Old'``.
    """
    text = (row['Title'] + " " + row['Description']).lower()

    # Only token texts are needed, so run the tokenizer alone instead of the
    # full pipeline; build the token set once rather than rescanning the
    # document for every feature.
    token_texts = {token.text for token in nlp_de.make_doc(text)}
    for feature, feature_keywords in keywords.items():
        row[feature] = not token_texts.isdisjoint(feature_keywords)

    # Merge new / renovated / old into one condition variable. This is a
    # substring search on the raw text (so multi-word keywords and compounds
    # containing a keyword count); 'New' is checked first, then 'Renovated',
    # and 'Old' is the fallback when neither matches.
    if any(keyword in text for keyword in keywords['new']):
        row['condition'] = 'New'
    elif any(keyword in text for keyword in keywords['renovated']):
        row['condition'] = 'Renovated'
    else:
        row['condition'] = 'Old'

    return row

# Run the German feature extraction on every listing (one call per row)
df = df.apply(detect_and_process_german, axis='columns')

# The per-condition flags are replaced by the merged 'condition' column
df = df.drop(columns=['new', 'renovated', 'old'])

# Write the enriched frame to disk without the index column
df.to_csv('enriched_properties.csv', index=False)

print("New CSV with enriched data has been saved.")


New CSV with enriched data has been saved.


WORKING NLP SCRIPT

In [3]:
import pandas as pd
import spacy
from langdetect import detect, LangDetectException

# Load the German, French, Italian and English spaCy pipelines (in that order)
nlp_de, nlp_fr, nlp_it, nlp_en = map(
    spacy.load,
    ('de_core_news_sm', 'fr_core_news_sm', 'it_core_news_sm', 'en_core_web_sm'),
)

# Keywords for each feature in multiple languages.
# Entries are kept lower-case because listings are lower-cased before they are
# matched ('Gartenanteil' could never match), each list is free of duplicates,
# and 'newly renovated' is not listed under 'new' since it describes a renovation.
keywords = {
    'terrace': {
        'de': ['terrasse', 'balkon', 'dachterrasse', 'veranda', 'freiluftbereich', 'attika', 'attikawohnung', 'loggia', 'sonnenterrasse', 'außenbereich', 'terrassenwohnung', 'dachgarten', 'terrassengarten', 'sonnendeck'],
        'fr': ['terrasse', 'balcon', 'toit-terrasse', 'véranda', 'espace extérieur', 'attique', 'terrassement', 'terrasses', 'balcons'],
        'it': ['terrazza', 'terrazzo', 'terrazzato', 'balcone', 'balconata', 'veranda', 'area esterna', 'attico', 'attica', 'solarium'],
        'en': ['terrace', 'balcony', 'roof terrace', 'veranda', 'outdoor area', 'penthouse', 'deck', 'patio']
    },
    'garden': {
        'de': ['garten', 'grünfläche', 'hof', 'park', 'rasenfläche', 'gartenbereich', 'gartenanlage', 'privatgarten', 'gemeinschaftsgarten', 'parkanlage', 'hofgarten', 'gartenteich', 'gartenlandschaft', 'gartenanteil'],
        'fr': ['jardin', 'espace vert', 'cour', 'parc', 'pelouse', 'jardinet', 'parterre', 'verdure'],
        'it': ['giardino', 'area verde', 'cortile', 'parco', 'prato', 'giardinetto', 'orto', 'verde pubblico', 'parterre'],
        'en': ['garden', 'green space', 'yard', 'park', 'lawn', 'backyard', 'courtyard', 'garden area', 'greensward']
    },
    'new': {
        'de': ['neubau', 'neu gebaut', 'neu entwickelt', 'brandneu', 'neukonstruktion', 'bauprojekt', 'neubauprojekt', 'neubauten', 'erstbezug', 'neuerstellung', 'neueröffnung', 'fabrikneu', 'neubaugebiet'],
        'fr': ['neuf', 'construction récente', 'développé récemment', 'tout neuf', 'projet de construction', 'nouvelle construction', 'nouvellement construit', 'flambant neuf', 'nouvelle'],
        'it': ['nuova costruzione', 'nuovo', 'appena costruito', 'sviluppato di recente', 'nuovissimo', 'progetto edilizio', 'nuova edificazione', 'nuova'],
        'en': ['new build', 'newly built', 'recently developed', 'brand new', 'construction project', 'new development', 'newly constructed']
    },
    'renovated': {
        'de': ['renoviert', 'modernisiert', 'saniert', 'überholt', 'restauriert', 'renovierte', 'kernsaniert', 'refurbished', 'instandgesetzt', 'aufgefrischt', 'modernisierung', 'sanierung'],
        'fr': ['rénové', 'modernisé', 'assaini', 'refait', 'restauré', 'rénovation', 'modernisation', 'assainissement', 'refaite'],
        'it': ['ristrutturato', 'modernizzato', 'sistemato', 'rifatto', 'restaurato', 'rinnovato', 'ammodernato', 'aggiornato', 'ristrutturazione'],
        'en': ['renovated', 'modernized', 'refurbished', 'overhauled', 'restored', 'revamped', 'updated', 'renewed']
    },
    'old': {
        'de': ['altbau', 'alt', 'traditionell', 'historisch', 'erbstück', 'altbauflair', 'altbauwohnung', 'baudenkmal', 'denkmalschutz', 'ursprünglich', 'zeitlos', 'antik', 'alte bausubstanz'],
        'fr': ['ancien', 'traditionnel', 'historique', 'héritage', 'ancienne construction', 'vieille bâtisse', 'patrimoine', 'historiquement'],
        'it': ['vecchia costruzione', 'antico', 'tradizionale', 'storico', 'eredità', 'vecchio', 'antica', 'storica'],
        'en': ['old building', 'old', 'traditional', 'historical', 'heritage', 'antique', 'vintage', 'classic']
    },
    'view': {
        'de': ['aussicht', 'blick', 'panorama', 'sicht', 'ausblick', 'seesicht', 'fernblick', 'weitblick', 'bergblick', 'stadtblick', 'rundumblick', 'panoramablick'],
        'fr': ['vue', 'perspective', 'panorama', 'vue dégagée', 'panoramique', 'vue panoramique', 'vue sur la mer', 'vue montagne'],
        'it': ['vista', 'panorama', 'prospettiva', 'vista aperta', 'panoramica', 'panoramici', 'vista mare', 'vista montagna'],
        'en': ['view', 'sight', 'panorama', 'outlook', 'seaview', 'panoramic', 'scenic view', 'landscape view']
    },
    'city_center': {
        'de': ['stadtzentrum', 'zentral', 'mitte', 'kerngebiet', 'stadtmitte', 'zentrumsnah', 'innenstadt', 'stadtzentral', 'city', 'zentrumslage', 'innenstadtnah', 'stadtkern'],
        'fr': ['centre-ville', 'central', 'coeur de ville', 'centre urbain', 'hyper-centre', 'en plein centre', 'centrale', 'centre'],
        'it': ['centro città', 'centrale', 'cuore della città', 'centro urbano', 'centro', 'nel centro', 'cuore del centro'],
        'en': ['city center', 'central', 'downtown', 'heart of the city', 'urban center', 'city core', 'town center', 'city centre']
    }
}


# Function to detect and process features in multilingual text
def _flatten_words(text):
    """Lower-case ``text`` and replace every run of non-alphanumerics by one space."""
    return " ".join("".join(ch if ch.isalnum() else " " for ch in text.lower()).split())


def _keyword_hit(token_texts, flat_text, candidates):
    """Return True if any keyword in ``candidates`` occurs in the listing.

    Args:
        token_texts: set of the listing's token strings (lower-cased text).
        flat_text: the listing passed through ``_flatten_words`` and padded
            with one space on each side, used for whole-phrase lookups.
        candidates: iterable of keywords; compared case-insensitively.
    """
    for keyword in candidates:
        keyword = keyword.lower()
        if keyword in token_texts:
            return True
        # Keywords with spaces or hyphens are normally split over several
        # tokens, so additionally look for them as a whole phrase.
        if not keyword.isalnum():
            phrase = _flatten_words(keyword)
            if phrase and f" {phrase} " in flat_text:
                return True
    return False


def detect_and_process(row):
    """Add one boolean column per entry of ``keywords`` to a listing row.

    The language of title + description is detected with ``langdetect``.
    German is used when detection fails and also when the detected language
    is not one of de / fr / it / en; previously such rows were left without
    any feature value, which shows up as NaN after ``DataFrame.apply``.

    Args:
        row: mapping / ``pandas.Series`` with string entries ``'Title'`` and
            ``'Description'``.

    Returns:
        The same ``row`` with ``row[feature]`` set to True/False for every
        feature in ``keywords`` (False when a feature has no keyword list for
        the language in use).
    """
    text = (row['Title'] + " " + row['Description']).lower()
    # NOTE(review): langdetect results are randomised unless
    # DetectorFactory.seed is set - consider seeding for reproducible output.
    try:
        language = detect(text)
    except LangDetectException:
        language = 'de'  # Default language when nothing can be detected

    models = {'de': nlp_de, 'fr': nlp_fr, 'it': nlp_it, 'en': nlp_en}
    if language not in models:
        language = 'de'

    # Only token texts are needed, so run the tokenizer alone instead of the
    # full pipeline.
    doc = models[language].make_doc(text)
    token_texts = {token.text for token in doc}
    flat_text = f" {_flatten_words(text)} "

    for feature, lang_keywords in keywords.items():
        row[feature] = _keyword_hit(token_texts, flat_text, lang_keywords.get(language, ()))
    return row

# Read the semicolon-separated listings file
try:
    df = pd.read_csv('final_filtered_immoscout24.csv', sep=';')
except Exception as e:
    print(f"Error reading the CSV file: {e}")
    # Re-raise instead of calling exit(): exit() would end the whole session
    # (and, run as a script, finish with status 0 as if nothing went wrong),
    # whereas re-raising stops this cell and keeps the traceback.
    raise

# Ensure descriptions and titles are strings and handle missing data
df['Description'] = df['Description'].fillna('').astype(str)
df['Title'] = df['Title'].fillna('').astype(str)

# Apply language-specific detection and feature extraction (one call per row)
df = df.apply(detect_and_process, axis=1)

# Save the enriched DataFrame to a new CSV file
df.to_csv('final_filtered_NLP_immoscout24.csv', index=False)

print("New CSV with enriched data has been saved.")


New CSV with enriched data has been saved.


WITH CONDITION VARIABLE

In [1]:
import pandas as pd
import spacy
from langdetect import detect, LangDetectException

# Load the German, French, Italian and English spaCy pipelines (in that order)
nlp_de, nlp_fr, nlp_it, nlp_en = map(
    spacy.load,
    ('de_core_news_sm', 'fr_core_news_sm', 'it_core_news_sm', 'en_core_web_sm'),
)

# Keywords for each feature in multiple languages.
# Entries are kept lower-case because listings are lower-cased before they are
# matched ('Gartenanteil' could never match), each list is free of duplicates,
# and 'newly renovated' is not listed under 'new' since it describes a renovation.
keywords = {
    'terrace': {
        'de': ['terrasse', 'balkon', 'dachterrasse', 'veranda', 'freiluftbereich', 'attika', 'attikawohnung', 'loggia', 'sonnenterrasse', 'außenbereich', 'terrassenwohnung', 'dachgarten', 'terrassengarten', 'sonnendeck'],
        'fr': ['terrasse', 'balcon', 'toit-terrasse', 'véranda', 'espace extérieur', 'attique', 'terrassement', 'terrasses', 'balcons'],
        'it': ['terrazza', 'terrazzo', 'terrazzato', 'balcone', 'balconata', 'veranda', 'area esterna', 'attico', 'attica', 'solarium'],
        'en': ['terrace', 'balcony', 'roof terrace', 'veranda', 'outdoor area', 'penthouse', 'deck', 'patio']
    },
    'garden': {
        'de': ['garten', 'grünfläche', 'hof', 'park', 'rasenfläche', 'gartenbereich', 'gartenanlage', 'privatgarten', 'gemeinschaftsgarten', 'parkanlage', 'hofgarten', 'gartenteich', 'gartenlandschaft', 'gartenanteil'],
        'fr': ['jardin', 'espace vert', 'cour', 'parc', 'pelouse', 'jardinet', 'parterre', 'verdure'],
        'it': ['giardino', 'area verde', 'cortile', 'parco', 'prato', 'giardinetto', 'orto', 'verde', 'parterre'],
        'en': ['garden', 'green space', 'yard', 'park', 'lawn', 'backyard', 'courtyard', 'garden area', 'greensward']
    },
    'new': {
        'de': ['neubau', 'neu gebaut', 'neu entwickelt', 'brandneu', 'neukonstruktion', 'bauprojekt', 'neubauprojekt', 'neubauten', 'erstbezug', 'neuerstellung', 'neueröffnung', 'fabrikneu', 'neubaugebiet'],
        'fr': ['neuf', 'construction récente', 'développé récemment', 'tout neuf', 'projet de construction', 'nouvelle construction', 'nouvellement construit', 'flambant neuf', 'nouvelle'],
        'it': ['nuova costruzione', 'nuovo', 'appena costruito', 'sviluppato di recente', 'nuovissimo', 'progetto edilizio', 'nuova edificazione', 'nuova'],
        'en': ['new build', 'newly built', 'recently developed', 'brand new', 'construction project', 'new development', 'newly constructed']
    },
    'renovated': {
        'de': ['renoviert', 'modernisiert', 'saniert', 'überholt', 'restauriert', 'renovierte', 'kernsaniert', 'refurbished', 'instandgesetzt', 'aufgefrischt', 'modernisierung', 'sanierung'],
        'fr': ['rénové', 'modernisé', 'assaini', 'refait', 'restauré', 'rénovation', 'modernisation', 'assainissement', 'refaite'],
        'it': ['ristrutturato', 'modernizzato', 'sistemato', 'rifatto', 'restaurato', 'rinnovato', 'ammodernato', 'aggiornato', 'ristrutturazione'],
        'en': ['renovated', 'modernized', 'refurbished', 'overhauled', 'restored', 'revamped', 'updated', 'renewed']
    },
    'old': {
        'de': ['altbau', 'alt', 'traditionell', 'historisch', 'erbstück', 'altbauflair', 'altbauwohnung', 'baudenkmal', 'denkmalschutz', 'ursprünglich', 'zeitlos', 'antik', 'alte bausubstanz'],
        'fr': ['ancien', 'traditionnel', 'historique', 'héritage', 'ancienne construction', 'vieille bâtisse', 'patrimoine', 'historiquement'],
        'it': ['vecchia costruzione', 'antico', 'tradizionale', 'storico', 'eredità', 'vecchio', 'antica', 'storica'],
        'en': ['old building', 'old', 'traditional', 'historical', 'heritage', 'antique', 'vintage', 'classic']
    },
    'view': {
        'de': ['aussicht', 'blick', 'panorama', 'sicht', 'ausblick', 'seesicht', 'fernblick', 'weitblick', 'bergblick', 'stadtblick', 'rundumblick', 'panoramablick'],
        'fr': ['vue', 'perspective', 'panorama', 'vue dégagée', 'panoramique', 'vue panoramique', 'vue sur la mer', 'vue montagne'],
        'it': ['vista', 'panorama', 'prospettiva', 'vista aperta', 'panoramica', 'panoramici', 'vista mare', 'vista montagna'],
        'en': ['view', 'sight', 'panorama', 'outlook', 'seaview', 'panoramic', 'scenic view', 'landscape view']
    },
    'city_center': {
        'de': ['stadtzentrum', 'zentral', 'mitte', 'kerngebiet', 'stadtmitte', 'zentrumsnah', 'innenstadt', 'stadtzentral', 'city', 'zentrumslage', 'innenstadtnah', 'stadtkern', 'stadtnah'],
        'fr': ['centre-ville', 'central', 'coeur de ville', 'centre urbain', 'hyper-centre', 'en plein centre', 'centrale', 'centre'],
        'it': ['centro città', 'centrale', 'cuore della città', 'centro urbano', 'centro', 'nel centro', 'cuore del centro', 'nucleo'],
        'en': ['city center', 'central', 'downtown', 'heart of the city', 'urban center', 'city core', 'town center', 'city centre']
    }
}

# Function to detect and process features in multilingual text
def _flatten_words(text):
    """Lower-case ``text`` and replace every run of non-alphanumerics by one space."""
    return " ".join("".join(ch if ch.isalnum() else " " for ch in text.lower()).split())


def _keyword_hit(token_texts, flat_text, candidates):
    """Return True if any keyword in ``candidates`` occurs in the listing.

    Args:
        token_texts: set of the listing's token strings (lower-cased text).
        flat_text: the listing passed through ``_flatten_words`` and padded
            with one space on each side, used for whole-phrase lookups.
        candidates: iterable of keywords; compared case-insensitively.
    """
    for keyword in candidates:
        keyword = keyword.lower()
        if keyword in token_texts:
            return True
        # Keywords with spaces or hyphens are normally split over several
        # tokens, so additionally look for them as a whole phrase.
        if not keyword.isalnum():
            phrase = _flatten_words(keyword)
            if phrase and f" {phrase} " in flat_text:
                return True
    return False


def detect_and_process(row):
    """Flag boolean features on one listing and derive its ``condition``.

    The language of title + description is detected with ``langdetect``.
    German is used when detection fails and also when the detected language
    is not one of de / fr / it / en (previously such rows were tokenised with
    the English pipeline and matched against no keywords at all).

    Args:
        row: mapping / ``pandas.Series`` with string entries ``'Title'`` and
            ``'Description'``.

    Returns:
        The same ``row`` with

        * ``city_center``, ``garden``, ``terrace``, ``view`` set to True/False;
        * ``condition`` set to ``'new'``, ``'renovated'`` or ``'old'``.
          ``'old'`` is the default; when several condition keyword groups
          match, the group that comes last in ``keywords`` wins.
    """
    boolean_features = ('city_center', 'garden', 'terrace', 'view')
    condition_features = ('new', 'renovated', 'old')

    text = (row['Title'] + " " + row['Description']).lower()
    # NOTE(review): langdetect results are randomised unless
    # DetectorFactory.seed is set - consider seeding for reproducible output.
    try:
        language = detect(text)
    except LangDetectException:
        language = 'de'  # Default to German if detection fails

    models = {'de': nlp_de, 'fr': nlp_fr, 'it': nlp_it, 'en': nlp_en}
    if language not in models:
        language = 'de'

    # Only token texts are needed, so run the tokenizer alone instead of the
    # full pipeline.
    doc = models[language].make_doc(text)
    token_texts = {token.text for token in doc}
    flat_text = f" {_flatten_words(text)} "

    # Initialize default values for boolean features
    for feature in boolean_features:
        row[feature] = False

    # Set default condition
    row['condition'] = 'old'

    for feature, lang_keywords in keywords.items():
        if language not in lang_keywords:
            continue
        hit = _keyword_hit(token_texts, flat_text, lang_keywords[language])
        if feature in boolean_features:
            row[feature] = hit
        elif feature in condition_features and hit:
            # Later groups overwrite earlier ones.
            # NOTE(review): this ranks 'old' above 'renovated' above 'new',
            # unlike the German-only draft that checks 'New' first - confirm
            # which precedence is intended.
            row['condition'] = feature

    return row

# Read the semicolon-separated listings file
try:
    df = pd.read_csv('final_filtered_immoscout24.csv', sep=';')
except Exception as e:
    print(f"Error reading the CSV file: {e}")
    # Re-raise instead of calling exit(): exit() would end the whole session
    # (and, run as a script, finish with status 0 as if nothing went wrong),
    # whereas re-raising stops this cell and keeps the traceback.
    raise

# Ensure descriptions and titles are strings and handle missing data
df['Description'] = df['Description'].fillna('').astype(str)
df['Title'] = df['Title'].fillna('').astype(str)

# Apply language-specific detection and feature extraction (one call per row)
df = df.apply(detect_and_process, axis=1)

# Save the enriched DataFrame to a new CSV file
df.to_csv('final_filtered_NLP_immoscout24.csv', index=False)

print("New CSV with enriched data has been saved.")


New CSV with enriched data has been saved.


NEW VARIABLE LUXUS

In [1]:
import pandas as pd
import spacy
from langdetect import detect, LangDetectException

# Load the German, French, Italian and English spaCy pipelines (in that order)
nlp_de, nlp_fr, nlp_it, nlp_en = map(
    spacy.load,
    ('de_core_news_sm', 'fr_core_news_sm', 'it_core_news_sm', 'en_core_web_sm'),
)

# Keywords for each feature in multiple languages.
# Entries are kept lower-case because listings are lower-cased before they are
# matched ('Gartenanteil', 'Attikawohnung' and 'Attika-Wohnung' could never
# match), each list is free of duplicates, and 'newly renovated' is not listed
# under 'new' since it describes a renovation.
keywords = {
    'terrace': {
        'de': ['terrasse', 'balkon', 'dachterrasse', 'veranda', 'freiluftbereich', 'attika', 'attikawohnung', 'loggia', 'sonnenterrasse', 'außenbereich', 'terrassenwohnung', 'dachgarten', 'terrassengarten', 'sonnendeck'],
        'fr': ['terrasse', 'balcon', 'toit-terrasse', 'véranda', 'espace extérieur', 'attique', 'terrassement', 'terrasses', 'balcons'],
        'it': ['terrazza', 'terrazzo', 'terrazzato', 'balcone', 'balconata', 'veranda', 'area esterna', 'attico', 'attica', 'solarium'],
        'en': ['terrace', 'balcony', 'roof terrace', 'veranda', 'outdoor area', 'penthouse', 'deck', 'patio']
    },
    'garden': {
        'de': ['garten', 'grünfläche', 'hof', 'park', 'rasenfläche', 'gartenbereich', 'gartenanlage', 'privatgarten', 'gemeinschaftsgarten', 'parkanlage', 'hofgarten', 'gartenteich', 'gartenlandschaft', 'gartenanteil'],
        'fr': ['jardin', 'espace vert', 'cour', 'parc', 'pelouse', 'jardinet', 'parterre', 'verdure'],
        'it': ['giardino', 'area verde', 'cortile', 'parco', 'prato', 'giardinetto', 'orto', 'verde pubblico', 'parterre'],
        'en': ['garden', 'green space', 'yard', 'park', 'lawn', 'backyard', 'courtyard', 'garden area', 'greensward']
    },
    'new': {
        'de': ['neubau', 'neu gebaut', 'neu entwickelt', 'brandneu', 'neukonstruktion', 'bauprojekt', 'neubauprojekt', 'neubauten', 'erstbezug', 'neuerstellung', 'neueröffnung', 'fabrikneu', 'neubaugebiet'],
        'fr': ['neuf', 'construction récente', 'développé récemment', 'tout neuf', 'projet de construction', 'nouvelle construction', 'nouvellement construit', 'flambant neuf', 'nouvelle'],
        'it': ['nuova costruzione', 'nuovo', 'appena costruito', 'sviluppato di recente', 'nuovissimo', 'progetto edilizio', 'nuova edificazione', 'nuova'],
        'en': ['new build', 'newly built', 'recently developed', 'brand new', 'construction project', 'new development', 'newly constructed']
    },
    'renovated': {
        'de': ['renoviert', 'modernisiert', 'saniert', 'überholt', 'restauriert', 'renovierte', 'kernsaniert', 'refurbished', 'instandgesetzt', 'aufgefrischt', 'modernisierung', 'sanierung'],
        'fr': ['rénové', 'modernisé', 'assaini', 'refait', 'restauré', 'rénovation', 'modernisation', 'assainissement', 'refaite'],
        'it': ['ristrutturato', 'modernizzato', 'sistemato', 'rifatto', 'restaurato', 'rinnovato', 'ammodernato', 'aggiornato', 'ristrutturazione'],
        'en': ['renovated', 'modernized', 'refurbished', 'overhauled', 'restored', 'revamped', 'updated', 'renewed']
    },
    'old': {
        'de': ['altbau', 'alt', 'traditionell', 'historisch', 'erbstück', 'altbauflair', 'altbauwohnung', 'baudenkmal', 'denkmalschutz', 'ursprünglich', 'zeitlos', 'antik', 'alte bausubstanz'],
        'fr': ['ancien', 'traditionnel', 'historique', 'héritage', 'ancienne construction', 'vieille bâtisse', 'patrimoine', 'historiquement'],
        'it': ['vecchia costruzione', 'antico', 'tradizionale', 'storico', 'eredità', 'vecchio', 'antica', 'storica'],
        'en': ['old building', 'old', 'traditional', 'historical', 'heritage', 'antique', 'vintage', 'classic']
    },
    'view': {
        'de': ['aussicht', 'blick', 'panorama', 'sicht', 'ausblick', 'seesicht', 'fernblick', 'weitblick', 'bergblick', 'stadtblick', 'rundumblick', 'panoramablick'],
        'fr': ['vue', 'perspective', 'panorama', 'vue dégagée', 'panoramique', 'vue panoramique', 'vue sur la mer', 'vue montagne'],
        'it': ['vista', 'panorama', 'prospettiva', 'vista aperta', 'panoramica', 'panoramici', 'vista mare', 'vista montagna'],
        'en': ['view', 'sight', 'panorama', 'outlook', 'seaview', 'panoramic', 'scenic view', 'landscape view']
    },
    'city_center': {
        'de': ['stadtzentrum', 'zentral', 'mitte', 'kerngebiet', 'stadtmitte', 'zentrumsnah', 'innenstadt', 'stadtzentral', 'city', 'zentrumslage', 'innenstadtnah', 'stadtkern', 'stadtnah'],
        'fr': ['centre-ville', 'central', 'coeur de ville', 'centre urbain', 'hyper-centre', 'en plein centre', 'centrale', 'centre'],
        'it': ['centro città', 'centrale', 'cuore della città', 'centro urbano', 'centro', 'nel centro', 'cuore del centro', 'nucleo'],
        'en': ['city center', 'central', 'downtown', 'heart of the city', 'urban center', 'city core', 'town center', 'city centre']
    },
    'luxus': {
        'de': ['luxus', 'villa', 'penthaus', 'attika', 'attikawohnung', 'luxuriös', 'luxuriöse', 'luxuriöses', 'attika-wohnung'],
        'fr': ['luxe', 'villa', 'penthouse', 'attique', 'luxueux'],
        'it': ['lusso', 'villa', 'attico', 'lussuoso', 'lussuosa'],
        'en': ['luxury', 'villa', 'penthouse', 'luxurious']
    },
}

# Function to detect and process features in multilingual text
def _flatten_words(text):
    """Lower-case ``text`` and replace every run of non-alphanumerics by one space."""
    return " ".join("".join(ch if ch.isalnum() else " " for ch in text.lower()).split())


def _keyword_hit(token_texts, flat_text, candidates):
    """Return True if any keyword in ``candidates`` occurs in the listing.

    Args:
        token_texts: set of the listing's token strings (lower-cased text).
        flat_text: the listing passed through ``_flatten_words`` and padded
            with one space on each side, used for whole-phrase lookups.
        candidates: iterable of keywords; compared case-insensitively.
    """
    for keyword in candidates:
        keyword = keyword.lower()
        if keyword in token_texts:
            return True
        # Keywords with spaces or hyphens are normally split over several
        # tokens, so additionally look for them as a whole phrase.
        if not keyword.isalnum():
            phrase = _flatten_words(keyword)
            if phrase and f" {phrase} " in flat_text:
                return True
    return False


def detect_and_process(row):
    """Flag boolean features (incl. ``luxus``) on one listing and derive its ``condition``.

    The language of title + description is detected with ``langdetect``.
    German is used when detection fails and also when the detected language
    is not one of de / fr / it / en (previously such rows were tokenised with
    the English pipeline and matched against no keywords at all).

    Args:
        row: mapping / ``pandas.Series`` with string entries ``'Title'`` and
            ``'Description'``.

    Returns:
        The same ``row`` with

        * ``city_center``, ``garden``, ``terrace``, ``view``, ``luxus`` set
          to True/False;
        * ``condition`` set to ``'new'``, ``'renovated'`` or ``'old'``.
          ``'old'`` is the default; when several condition keyword groups
          match, the group that comes last in ``keywords`` wins.
    """
    boolean_features = ('city_center', 'garden', 'terrace', 'view', 'luxus')
    condition_features = ('new', 'renovated', 'old')

    text = (row['Title'] + " " + row['Description']).lower()
    # NOTE(review): langdetect results are randomised unless
    # DetectorFactory.seed is set - consider seeding for reproducible output.
    try:
        language = detect(text)
    except LangDetectException:
        language = 'de'  # Default to German if detection fails

    models = {'de': nlp_de, 'fr': nlp_fr, 'it': nlp_it, 'en': nlp_en}
    if language not in models:
        language = 'de'

    # Only token texts are needed, so run the tokenizer alone instead of the
    # full pipeline.
    doc = models[language].make_doc(text)
    token_texts = {token.text for token in doc}
    flat_text = f" {_flatten_words(text)} "

    # Initialize default values for boolean features
    for feature in boolean_features:
        row[feature] = False

    # Set default condition
    row['condition'] = 'old'

    for feature, lang_keywords in keywords.items():
        if language not in lang_keywords:
            continue
        hit = _keyword_hit(token_texts, flat_text, lang_keywords[language])
        if feature in boolean_features:
            row[feature] = hit
        elif feature in condition_features and hit:
            # Later groups overwrite earlier ones.
            # NOTE(review): this ranks 'old' above 'renovated' above 'new',
            # unlike the German-only draft that checks 'New' first - confirm
            # which precedence is intended.
            row['condition'] = feature

    return row

# Read the semicolon-separated listings file
try:
    df = pd.read_csv('price_on_request_records.csv', sep=';')
except Exception as e:
    print(f"Error reading the CSV file: {e}")
    # Re-raise instead of calling exit(): exit() would end the whole session
    # (and, run as a script, finish with status 0 as if nothing went wrong),
    # whereas re-raising stops this cell and keeps the traceback.
    raise

# Ensure descriptions and titles are strings and handle missing data
df['Description'] = df['Description'].fillna('').astype(str)
df['Title'] = df['Title'].fillna('').astype(str)

# Apply language-specific detection and feature extraction (one call per row)
df = df.apply(detect_and_process, axis=1)

# Save the enriched DataFrame to a new CSV file
df.to_csv('enriched_price_on_request_records.csv', index=False)

print("New CSV with enriched data has been saved.")


New CSV with enriched data has been saved.
