In [288]:
import pandas
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
data_folder = "/home/aitor/Escritorio/TFM/Datos"
data_name = "data.csv"
data_path = "/".join((data_folder, data_name))
data = pandas.read_csv(data_path)

In [289]:
import json
questions_name = "questions.json"
questions_path = "/".join((data_folder, questions_name))
with open(questions_path) as json_file:
    questions = json.load(json_file)

In [290]:
# This function iterates over columns, not rows
def get_table_info(original_table, questions_template):
    new_table = original_table.copy()
    for question in new_table:
        # Check if the question is in the 'questions' dict
        if question in questions:
            #print("Respuesta correcta: {}".format(questions[question]["correct_answer"]))
            values = new_table[question]
            for index, value in values.iteritems():
                if value != "NS/NC":                   
                    new_table.at[index, question] = (value == questions_template[question]["correct_answer"])
        else:
            print("Error!")
            
    return new_table

In [291]:
all_questions_list = list(questions.keys())
myth_questions_list = [question for question, info in questions.items() if info["myth"]]
non_myth_questions_list = [question for question, info in questions.items() if not info["myth"]]

In [292]:
all_questions_data = data.filter(all_questions_list)
myth_questions_data = data.filter(myth_questions_list)
non_myth_questions_data = data.filter(non_myth_questions_list)

In [293]:
all_questions_info = get_table_info(all_questions_data, questions)
myth_questions_info = get_table_info(myth_questions_data, questions)
non_myth_questions_info = get_table_info(non_myth_questions_data, questions)

In [294]:
myth_questions_resume = myth_questions_info.apply(pandas.value_counts, axis=1).fillna(0)
non_myth_questions_resume = non_myth_questions_info.apply(pandas.value_counts, axis=1).fillna(0)

In [295]:
myth_questions_resume_percentage = myth_questions_resume.apply(lambda x: x * 100 / x.sum(), axis=1)
non_myth_questions_resume_percentage = non_myth_questions_resume.apply(lambda x: x * 100 / x.sum(), axis=1)

In [296]:
myth_questions_resume["No ha fallado (Mitos)"] = myth_questions_resume[True] + myth_questions_resume["NS/NC"]
non_myth_questions_resume["No ha fallado (General)"] = non_myth_questions_resume[True] + non_myth_questions_resume["NS/NC"]
myth_questions_resume_percentage["No ha fallado (Mitos)"] = myth_questions_resume_percentage[True] + myth_questions_resume_percentage["NS/NC"]
non_myth_questions_resume_percentage["No ha fallado (General)"] = non_myth_questions_resume_percentage[True] + non_myth_questions_resume_percentage["NS/NC"]

In [297]:
myth_questions_resume = myth_questions_resume.rename(columns={True: 'Ha acertado (Mitos)', False: 'Ha fallado (Mitos)', 'NS/NC': 'NS/NC (Mitos)'})
non_myth_questions_resume = non_myth_questions_resume.rename(columns={True: 'Ha acertado (General)', False: 'Ha fallado (General)', 'NS/NC': 'NS/NC (General)'})

In [298]:
data_simplified = data.drop(all_questions_list, axis=1).drop("Marca temporal", axis=1)

In [299]:
data_for_correlations = pandas.concat([data_simplified, non_myth_questions_resume, myth_questions_resume], axis=1)

In [300]:
def correlation_analysis(data, x_variable, y_variable, plot=True):
    if plot:
        plt.scatter(data[x_variable], data[y_variable], color='red')
        plt.title('{} Vs {}'.format(x_variable, y_variable), fontsize=14)
        plt.xlabel(x_variable, fontsize=14)
        plt.ylabel(y_variable, fontsize=14)
        plt.grid(True)
        #plt.show();
    
    correlation_value = data[x_variable].corr(data[y_variable])
    print("Correlation between '{}' and '{}' is {}".format(x_variable, y_variable, correlation_value))
    return correlation_value

In [301]:
#data_for_correlations.corr()

In [302]:
##### http://benalexkeen.com/mapping-categorical-data-in-pandas/
# age category (ordered)
age_category =  ['20-29', '30-39', '40-49', '50-59', '60 o más']
# sex category (unordered)
sex_category = ['Hombre', 'Mujer', 'Otro']
# teaching age category (ordered)
experience_category = ['1-2','3-5','6-15','16-25','26-35', '36 o más']
# vet category (unordered)
vet_category = ['Actividades Físicas y Deportivas', 'Administración y Gestión', 'Agraria', 'Artes gráficas', 'Artes y Artesanías','Comercio y Marketing','Edificación y Obra Civil','Electricidad y Electrónica','Energía y Agua','Fabricación mecánica','Hostelería y Turismo','Imagen Personal','Imagen y Sonido','Industrias Alimentarias','Industrias Extractivas','Informática y Comunicaciones','Instalación y Mantenimiento','Madera, Mueble y Corcho','Marítimo-Pesquera','Química','Sanidad','Seguridad y Medio Ambiente','Servicios Socioculturales y a la Comunidad','Textil, Confección y Piel','Transporte y Mantenimiento de Vehículos','Vidrio y Cerámica']
# general magazines category (unordered)
blogs_category = ['Sí', 'No']
# scientific magazines category (unordered)
divulgation_category = ['Sí', 'No']
# paperscategory (ordered)
papers_category = ['Ninguno', '1-2', '3-5', '6-10', 'Más de 10']

In [303]:
# Transform 'Edad (en años)' into an ordered categorical column
data_for_correlations_temp = data_for_correlations[data_for_correlations["Edad (en años)"] != "Prefiero no decirlo"]
#data_for_correlations_temp["Edad (en años)"] = data_for_correlations_temp["Edad (en años)"].astype(CategoricalDtype(categories=age_category, ordered=True)).cat.codes
#correlation_analysis(data_for_correlations_temp, "Edad (en años)", "Ha acertado (Mitos)")

In [304]:
# Transform sex into a categorical column and filter 'Prefiero no contestar' raws
data_for_correlations_temp = data_for_correlations[data_for_correlations["Sexo"] != "Prefiero no decirlo"]
data_for_correlations_temp = data_for_correlations_temp[data_for_correlations_temp["Sexo"] != "Otro"]
#data_for_correlations_temp["Sexo"] = data_for_correlations_temp["Sexo"].astype('category').cat.codes
#correlation_analysis(data_for_correlations_temp, "Sexo", "Ha acertado (Mitos)")

In [305]:
# Transform divulgation into a categorical column and filter 'Prefiero no contestar' raws
data_for_correlations_temp = data_for_correlations[data_for_correlations["¿Te interesa la divulgación científica?"] != "Prefiero no decirlo"]
#data_for_correlations_temp["¿Te interesa la divulgación científica?"] = data_for_correlations_temp["¿Te interesa la divulgación científica?"].astype('category').cat.codes
#correlation_analysis(data_for_correlations_temp, "¿Te interesa la divulgación científica?", "Ha acertado (Mitos)")

In [306]:
# Transform 'Experiencia docente (en años)' into an ordered categorical column
data_for_correlations_temp = data_for_correlations[data_for_correlations["Experiencia docente (en años)"] != "Prefiero no decirlo"]
#data_for_correlations_temp["Experiencia docente (en años)"] = data_for_correlations_temp["Experiencia docente (en años)"].astype(CategoricalDtype(categories=experience_category, ordered=True)).cat.codes
#correlation_analysis(data_for_correlations_temp, "Experiencia docente (en años)", "Ha acertado (Mitos)")

In [307]:
# Vamos a limpiar la tabla de valores no válidos: 'Prefiero no decirlo' y 'Otro'
import numpy as np
data_clear = data_for_correlations.replace(['Prefiero no decirlo', 'Otro'], np.NaN).dropna()

# Nombres de las columnas
age = "Edad (en años)"
sex = "Sexo"
experience = "Experiencia docente (en años)"
divulgation = "¿Te interesa la divulgación científica?"
blogs = "¿Lees revistas o blogs relacionados con la educación?"
papers = "¿Cuántos artículos científicos has leído en el último año?"
correct_general = "Ha acertado (General)"
correct_myths = "Ha acertado (Mitos)"

# Convertimos la categoría 'Edad' en numérica: Edad(en años)
data_clear[age] = data_clear[age].astype(CategoricalDtype(categories=age_category, ordered=True)).cat.codes

# Convertimos la categoría 'Sexo' en numérica: Sexo
data_clear[sex] = data_clear[sex].astype('category').cat.codes

# Convertimos la categoría 'Experiencia docente' en numérica: "Experiencia docente (en años)"
data_clear[experience] = data_clear[experience].astype(CategoricalDtype(categories=experience_category, ordered=True)).cat.codes

# Convertimos la categoría 'Divulgación científica' en numérica: ¿Te interesa la divulgación científica?
data_clear[divulgation] = data_clear[divulgation].astype('category').cat.codes

# Convertimos la categoría 'Revistas' en numérica: ¿Lees revistas o blogs relacionados con la educación?
data_clear[blogs] = data_clear[blogs].astype('category').cat.codes

# Convertimos la categoría 'Papers leídos' en numérica: ¿Cuántos artículos científicos has leído en el último año?
data_clear[papers] = data_clear[papers].astype(CategoricalDtype(categories=papers_category, ordered=True)).cat.codes

#data_clear

In [308]:
# Regresión lineal entre 'Ha acertado (Mitos)' y las variables edad, sexo, experiencia, divulgación, blogs y papers
independent_variable = data_clear[[age, sex, experience, divulgation, blogs, papers]]
dependent_variable = data_clear[correct_myths]
result = sm.OLS(dependent_variable, independent_variable).fit()
result.summary()

0,1,2,3
Dep. Variable:,Ha acertado (Mitos),R-squared:,0.756
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,76.98
Date:,"Fri, 21 Jun 2019",Prob (F-statistic):,3.71e-43
Time:,22:43:42,Log-Likelihood:,-290.27
No. Observations:,155,AIC:,592.5
Df Residuals:,149,BIC:,610.8
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Edad (en años),0.1449,0.209,0.695,0.488,-0.267,0.557
Sexo,-0.0409,0.273,-0.150,0.881,-0.581,0.499
Experiencia docente (en años),-0.0411,0.156,-0.263,0.793,-0.350,0.268
¿Te interesa la divulgación científica?,1.9352,0.330,5.858,0.000,1.282,2.588
¿Lees revistas o blogs relacionados con la educación?,0.9623,0.294,3.278,0.001,0.382,1.542
¿Cuántos artículos científicos has leído en el último año?,0.0592,0.105,0.566,0.572,-0.147,0.266

0,1,2,3
Omnibus:,0.078,Durbin-Watson:,2.061
Prob(Omnibus):,0.962,Jarque-Bera (JB):,0.099
Skew:,0.051,Prob(JB):,0.952
Kurtosis:,2.929,Cond. No.,11.0


In [309]:
# Regresión lineal entre 'Ha acertado (General)' y las variables edad, sexo, experiencia, divulgación, blogs y papers
independent_variable = data_clear[[age, sex, experience, divulgation, blogs, papers]]
dependent_variable = data_clear[correct_general]
result = sm.OLS(dependent_variable, independent_variable).fit()
result.summary()

0,1,2,3
Dep. Variable:,Ha acertado (General),R-squared:,0.858
Model:,OLS,Adj. R-squared:,0.852
Method:,Least Squares,F-statistic:,150.2
Date:,"Fri, 21 Jun 2019",Prob (F-statistic):,1.42e-60
Time:,22:43:42,Log-Likelihood:,-425.58
No. Observations:,155,AIC:,863.2
Df Residuals:,149,BIC:,881.4
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Edad (en años),1.7660,0.499,3.538,0.001,0.780,2.752
Sexo,1.5666,0.655,2.393,0.018,0.273,2.860
Experiencia docente (en años),-0.2122,0.375,-0.567,0.572,-0.952,0.528
¿Te interesa la divulgación científica?,5.5956,0.791,7.076,0.000,4.033,7.158
¿Lees revistas o blogs relacionados con la educación?,0.9344,0.703,1.330,0.186,-0.454,2.323
¿Cuántos artículos científicos has leído en el último año?,0.0032,0.250,0.013,0.990,-0.491,0.498

0,1,2,3
Omnibus:,0.905,Durbin-Watson:,1.824
Prob(Omnibus):,0.636,Jarque-Bera (JB):,0.539
Skew:,0.084,Prob(JB):,0.764
Kurtosis:,3.235,Cond. No.,11.0


In [310]:
# Regresión lineal entre 'Ha acertado (Mitos)' y las variables edad, sexo, experiencia, divulgación, blogs, papers y Ha acertado (General)
independent_variable = data_clear[[age, sex, experience, divulgation, blogs, papers]]
dependent_variable = data_clear[correct_myths]
result = sm.OLS(dependent_variable, independent_variable).fit()
result.summary()

0,1,2,3
Dep. Variable:,Ha acertado (Mitos),R-squared:,0.756
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,76.98
Date:,"Fri, 21 Jun 2019",Prob (F-statistic):,3.71e-43
Time:,22:43:43,Log-Likelihood:,-290.27
No. Observations:,155,AIC:,592.5
Df Residuals:,149,BIC:,610.8
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Edad (en años),0.1449,0.209,0.695,0.488,-0.267,0.557
Sexo,-0.0409,0.273,-0.150,0.881,-0.581,0.499
Experiencia docente (en años),-0.0411,0.156,-0.263,0.793,-0.350,0.268
¿Te interesa la divulgación científica?,1.9352,0.330,5.858,0.000,1.282,2.588
¿Lees revistas o blogs relacionados con la educación?,0.9623,0.294,3.278,0.001,0.382,1.542
¿Cuántos artículos científicos has leído en el último año?,0.0592,0.105,0.566,0.572,-0.147,0.266

0,1,2,3
Omnibus:,0.078,Durbin-Watson:,2.061
Prob(Omnibus):,0.962,Jarque-Bera (JB):,0.099
Skew:,0.051,Prob(JB):,0.952
Kurtosis:,2.929,Cond. No.,11.0


In [311]:
# Regresión lineal entre 'Ha acertado (Mitos)' y la variable 'Ha acertado (General)'
data_clear['const'] = 1
independent_variable = data_clear[[correct_general, 'const']]
dependent_variable = data_clear[correct_myths]
result = sm.OLS(dependent_variable, independent_variable).fit()
result.summary()

0,1,2,3
Dep. Variable:,Ha acertado (Mitos),R-squared:,0.145
Model:,OLS,Adj. R-squared:,0.14
Method:,Least Squares,F-statistic:,26.0
Date:,"Fri, 21 Jun 2019",Prob (F-statistic):,9.98e-07
Time:,22:43:44,Log-Likelihood:,-258.28
No. Observations:,155,AIC:,520.6
Df Residuals:,153,BIC:,526.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Ha acertado (General),0.1744,0.034,5.099,0.000,0.107,0.242
const,1.2082,0.342,3.531,0.001,0.532,1.884

0,1,2,3
Omnibus:,0.174,Durbin-Watson:,2.132
Prob(Omnibus):,0.917,Jarque-Bera (JB):,0.076
Skew:,-0.054,Prob(JB):,0.963
Kurtosis:,3.014,Cond. No.,33.4
