<a href="https://colab.research.google.com/github/braulioalda4-code/nuevo_repositorio/blob/main/py_Braulio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generar dataset de ejemplo
def generar_dataset_harvard(fecha_base=None, semilla=42):
    """Build a synthetic DataFrame of Harvard visitor reviews.

    One review is generated every 10 days going back 730 days from
    ``fecha_base`` (73 rows in total). Sentiment follows a fixed cycle over
    the row index: out of every 10 rows, 7 are positive (rating 4-5), 2 are
    neutral (rating 3) and 1 is negative (rating 1-2). The title is drawn
    from the pool that matches the sentiment.

    Args:
        fecha_base: Timestamp of the most recent review. When ``None`` the
            current local time (``datetime.now()``) is used.
        semilla: Seed for the random generator, so repeated calls produce
            the same ratings, titles, platforms, votes and texts.

    Returns:
        pandas.DataFrame with the columns ``published_date`` (ISO-8601
        string), ``published_platform``, ``rating``, ``type``,
        ``helpful_votes``, ``title`` and ``text``.
    """
    # A private RandomState produces the same stream as seeding the global
    # generator, but leaves np.random untouched for the rest of the program.
    rng = np.random.RandomState(semilla)

    if fecha_base is None:
        fecha_base = datetime.now()

    # Sample dates: one every 10 days over the last 2 years
    fechas = [fecha_base - timedelta(days=x) for x in range(0, 730, 10)]

    # Review titles grouped by sentiment
    titulos_positivos = [
        "Amazing campus experience", "Beautiful architecture", "Inspiring place",
        "Great student tour", "Historic and impressive", "Must visit in Boston",
        "Wonderful libraries", "Fantastic museums", "Dream come true"
    ]

    titulos_negativos = [
        "Overrated experience", "Too crowded", "Disappointing tour",
        "Not worth the hype", "Underwhelming", "Closed buildings"
    ]

    titulos_neutrales = [
        "Nice walk around campus", "Interesting history", "Good for photos",
        "Standard university tour", "Decent experience"
    ]

    # Sample review bodies
    textos = [
        "The campus was absolutely stunning with beautiful historic buildings and green spaces.",
        "The student-led tour was informative and engaging. Our guide was very knowledgeable.",
        "Expected more from such a prestigious university. Felt like any other campus.",
        "The libraries and museums are world-class. Definitely worth spending a full day here.",
        "Too many tourists, hard to appreciate the academic atmosphere.",
        "The architecture is incredible, especially the older buildings from the colonial era.",
        "Great place to walk around and feel the history. Don't miss the Harvard Yard.",
        "Some buildings were closed to visitors which was disappointing.",
        "The Harvard Square area has great restaurants and shops to explore after the tour.",
        "An inspiring place that makes you appreciate the pursuit of knowledge."
    ]

    data = []
    for i, fecha in enumerate(fechas):
        # Rating skewed towards positive; the order of the random draws
        # below is significant because it determines the generated values.
        if i % 10 < 7:  # 70% positive
            rating = rng.choice([4, 5])
            titulo = rng.choice(titulos_positivos)
        elif i % 10 < 9:  # 20% neutral
            rating = 3
            titulo = rng.choice(titulos_neutrales)
        else:  # 10% negative
            rating = rng.choice([1, 2])
            titulo = rng.choice(titulos_negativos)

        # Platform: 60% Desktop, 40% Mobile
        plataforma = rng.choice(['Desktop', 'Mobile'], p=[0.6, 0.4])

        # Helpful votes: 0 for most rows, otherwise a value from 1 to 9
        helpful_votes = 0 if rng.random_sample() > 0.2 else rng.randint(1, 10)

        data.append({
            # NOTE: the "-05:00" offset is a fixed literal in the format
            # string; it is not derived from the timestamp itself.
            'published_date': fecha.strftime('%Y-%m-%dT%H:%M:%S-05:00'),
            'published_platform': plataforma,
            'rating': rating,
            'type': 'review',
            'helpful_votes': helpful_votes,
            'title': titulo,
            # NOTE: the text is drawn independently of the rating, so its
            # sentiment may not match the rating or the title.
            'text': rng.choice(textos)
        })

    return pd.DataFrame(data)

# Build the dataset
print("🔄 Generando dataset de reseñas de Harvard...")
df = generar_dataset_harvard()
print(f"✅ Dataset generado: {df.shape[0]} reseñas creadas\n")

# Show basic information about the dataset
print("=== INFORMACIÓN BÁSICA DEL DATASET ===")
print(f"Dimensiones del dataset: {df.shape}")
print("\nPrimeras 5 filas:")
print(df.head().to_string())

print(f"\nColumnas disponibles: {df.columns.tolist()}")

# STORY 1: USING ILOC (position-based selection)
print("\n" + "="*60)
print("HISTORIA 1: ANÁLISIS CON ILOC")
print("="*60)

# First 10 reviews, selected with iloc
print("\n📖 Las primeras 10 reseñas del dataset:")
primeras_10 = df.iloc[0:10]
for i, (_, fila) in enumerate(primeras_10.iterrows(), 1):
    # The [:10] slice keeps the YYYY-MM-DD part; it requires published_date
    # to still be a string at this point of the script.
    print(f"{i}. [{fila['rating']}⭐] {fila['title']} - {fila['published_date'][:10]}")

# Reviews at specific positions
print("\n🎯 Reseñas en posiciones estratégicas:")
posiciones = [0, 10, 20, 30, 40, -1]
# Drop positions that fall outside the frame so a shorter dataset does not
# raise IndexError in iloc.
posiciones = [p for p in posiciones if -len(df) <= p < len(df)]
reseñas_estrategicas = df.iloc[posiciones]
for idx, row in reseñas_estrategicas.iterrows():
    # idx is the row's index label, so position -1 prints the last label
    print(f"Posición {idx}: [{row['rating']}⭐] '{row['title']}'")

# Specific columns selected by numeric position
print("\n📊 Columnas específicas por posición numérica:")
# Columns: 0:published_date, 2:rating, 5:title, 6:text
datos_esenciales = df.iloc[:, [0, 2, 5, 6]].head(3)
print(datos_esenciales.to_string())

# STORY 2: USING LOC (label-based selection)
print("\n" + "="*60)
print("HISTORIA 2: ANÁLISIS CON LOC")
print("="*60)

# Convert column types for analysis
df['published_date'] = pd.to_datetime(df['published_date'])
df['rating'] = pd.to_numeric(df['rating'])

# Find every 5-star review
print("\n⭐ Reseñas de 5 estrellas:")
cinco_estrellas = df.loc[df['rating'] == 5]
print(f"Total de reseñas de 5 estrellas: {len(cinco_estrellas)}")

if not cinco_estrellas.empty:
    print("\nPrimeras 3 reseñas de 5 estrellas:")
    muestras = cinco_estrellas.head(3)
    for i, (_, row) in enumerate(muestras.iterrows(), 1):
        print(f"{i}. {row['title']}")
        print(f"   Texto: {row['text'][:80]}...")

# Reviews that received helpful votes
print("\n👍 Reseñas con votos útiles:")
votos_utiles = df.loc[df['helpful_votes'] > 0]
print(f"Total reseñas con votos útiles: {len(votos_utiles)}")

if not votos_utiles.empty:
    votos_ordenados = votos_utiles.sort_values('helpful_votes', ascending=False).head(3)
    print("Top 3 reseñas más útiles:")
    for _, row in votos_ordenados.iterrows():
        print(f"   [{row['helpful_votes']} votos] '{row['title']}' - Rating: {row['rating']}⭐")

# Reviews posted from mobile with a high rating
print("\n📱 Mejores reseñas desde móvil:")
mobile_excelentes = df.loc[(df['published_platform'] == 'Mobile') & (df['rating'] >= 4)]
print(f"Reseñas excelentes desde móvil: {len(mobile_excelentes)}")

if not mobile_excelentes.empty:
    muestras_mobile = mobile_excelentes.head(2)
    for _, row in muestras_mobile.iterrows():
        print(f"   [{row['rating']}⭐] {row['title']}")
        print(f"   {row['text'][:60]}...")

# STORY 3: COMBINING ILOC AND LOC
print("\n" + "="*60)
print("HISTORIA 3: COMBINANDO ILOC Y LOC")
print("="*60)

# Filter with loc first, then pick specific positions with iloc
print("\n📉 Análisis combinado: reseñas negativas recientes")

# Extract the year for filtering
df['year'] = df['published_date'].dt.year

# "Recent" means the previous and the current calendar year. Deriving the
# cutoff from the clock keeps the filter meaningful as time passes, instead
# of relying on a hard-coded year.
año_actual = datetime.now().year
reseñas_negativas_recientes = df.loc[(df['year'] >= año_actual - 1) & (df['rating'] <= 2)]

print(f"Reseñas negativas recientes: {len(reseñas_negativas_recientes)}")

# Use iloc to take specific samples from the filtered reviews
if not reseñas_negativas_recientes.empty:
    # Spread up to 3 sample positions evenly across the filtered rows
    step = max(1, len(reseñas_negativas_recientes) // 3)
    indices_muestras = list(range(0, len(reseñas_negativas_recientes), step))[:3]
    muestras_negativas = reseñas_negativas_recientes.iloc[indices_muestras]

    print("\nMuestras de reseñas negativas recientes:")
    for _, row in muestras_negativas.iterrows():
        print(f"   [{row['rating']}⭐] {row['title']}")
        print(f"   Fecha: {row['published_date'].strftime('%Y-%m-%d')}")
        print(f"   Plataforma: {row['published_platform']}")
        print(f"   Texto: {row['text'][:100]}...\n")

# STORY 4: TEMPORAL ANALYSIS
print("\n" + "="*60)
print("HISTORIA 4: EVOLUCIÓN TEMPORAL DE RESEÑAS")
print("="*60)

# Use loc to keep only recent years (relies on the 'year' column of df)
reseñas_recientes = df.loc[df['year'] >= 2022]

# Group by year and count reviews
conteo_anual = reseñas_recientes.groupby('year').size()
print("\n📈 Cantidad de reseñas por año:")
for año, cantidad in conteo_anual.items():
    print(f"   {año}: {cantidad} reseñas")

# Average rating per year: loc filters each year, then the mean is computed
print("\n📊 Rating promedio por año:")
for año in sorted(reseñas_recientes['year'].unique()):
    reseñas_año = reseñas_recientes.loc[reseñas_recientes['year'] == año]
    rating_promedio = reseñas_año['rating'].mean()
    print(f"   {año}: {rating_promedio:.2f}⭐")

# STORY 5: PLATFORM COMPARISON
print("\n" + "="*60)
print("HISTORIA 5: COMPARATIVA ENTRE PLATAFORMAS")
print("="*60)

# Use loc to split the reviews by platform
desktop_reviews = df.loc[df['published_platform'] == 'Desktop']
mobile_reviews = df.loc[df['published_platform'] == 'Mobile']

print(f"\n💻 Total reseñas Desktop: {len(desktop_reviews)}")
print(f"📱 Total reseñas Mobile: {len(mobile_reviews)}")

# Average rating per platform (skipped when a platform has no reviews)
if not desktop_reviews.empty:
    rating_desktop = desktop_reviews['rating'].mean()
    print(f"⭐ Rating promedio Desktop: {rating_desktop:.2f}")

if not mobile_reviews.empty:
    rating_mobile = mobile_reviews['rating'].mean()
    print(f"⭐ Rating promedio Mobile: {rating_mobile:.2f}")

# Representative samples per platform using iloc: the first two rows of
# each subset, or fewer when the subset is shorter.
print("\n🔍 Muestras representativas por plataforma:")
for encabezado, subconjunto in (("Desktop:", desktop_reviews), ("\nMobile:", mobile_reviews)):
    print(encabezado)
    for _, muestra in subconjunto.iloc[:2].iterrows():
        print(f"   [{muestra['rating']}⭐] {muestra['title'][:50]}...")

# FINAL SUMMARY
print("\n" + "="*60)
print("RESUMEN EJECUTIVO")
print("="*60)

print(f"📊 Dataset completo: {df.shape[0]} reseñas")
print(f"⭐ Rating promedio general: {df['rating'].mean():.2f}")
print(f"📅 Período cubierto: {df['published_date'].min().strftime('%Y-%m-%d')} a {df['published_date'].max().strftime('%Y-%m-%d')}")

# Distribution by platform (guarded to avoid dividing by zero)
total = len(df)
if total > 0:
    print("📱 Distribución por plataforma:")
    print(f"   - Desktop: {len(desktop_reviews)} reseñas ({(len(desktop_reviews)/total)*100:.1f}%)")
    print(f"   - Mobile: {len(mobile_reviews)} reseñas ({(len(mobile_reviews)/total)*100:.1f}%)")

# Distribution of ratings, ordered from lowest to highest rating
print("\n🎯 Distribución de ratings:")
distribucion_ratings = df['rating'].value_counts().sort_index()
for rating, cantidad in distribucion_ratings.items():
    porcentaje = (cantidad / total) * 100
    print(f"   {rating:.0f}⭐: {cantidad} reseñas ({porcentaje:.1f}%)")

# INTERMEDIATE DATASETS GENERATED
print("\n" + "="*60)
print("DATASETS INTERMEDIOS GENERADOS")
print("="*60)

print(f"1. Dataset principal: {df.shape}")
print(f"2. Reseñas 5 estrellas: {cinco_estrellas.shape}")
print(f"3. Reseñas con votos útiles: {votos_utiles.shape}")
print(f"4. Reseñas móviles excelentes: {mobile_excelentes.shape}")
print(f"5. Reseñas desktop: {desktop_reviews.shape}")
print(f"6. Reseñas móviles: {mobile_reviews.shape}")

# Save the main dataset for reference (written to the working directory)
df.to_csv('harvard_reviews_generated.csv', index=False)
print("\n💾 Dataset guardado como 'harvard_reviews_generated.csv'")

print("\n" + "="*60)
print("ANÁLISIS COMPLETADO EXITOSAMENTE! 🎓")
print("="*60)

🔄 Generando dataset de reseñas de Harvard...
✅ Dataset generado: 73 reseñas creadas

=== INFORMACIÓN BÁSICA DEL DATASET ===
Dimensiones del dataset: (73, 7)

Primeras 5 filas:
              published_date published_platform  rating    type  helpful_votes                 title                                                                                    text
0  2025-10-12T21:58:36-05:00             Mobile       4  review              0    Great student tour                          Too many tourists, hard to appreciate the academic atmosphere.
1  2025-10-02T21:58:36-05:00            Desktop       4  review              0       Inspiring place                          Too many tourists, hard to appreciate the academic atmosphere.
2  2025-09-22T21:58:36-05:00             Mobile       5  review              2     Fantastic museums                         Some buildings were closed to visitors which was disappointing.
3  2025-09-12T21:58:36-05:00            Desktop       5  re